In [1]:
import pyranges as pr
import numpy as np
import pandas as pd
import os
import sys
import time
from timeit import default_timer as timer

In [192]:
# A plus strand transcript
test_ref_tr1 = {"Chromosome": [1]*5,
                "Start": [10,10,100,200,300],
                "End": [340,30,120,220,340],
                "Strand": ["+"]*5,
                "Feature": ["transcript"] +["exon"]*4,
                "gene_id": ["ref_gene_1"]*5,
                "transcript_id": ["ref_tr_1"] * 5,
                "exon_number": [None,1,2,3,4],
                "gene_type": ["protein_coding"] * 5}


pr.from_dict(test_ref_tr1)


Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,340,+,transcript,ref_gene_1,ref_tr_1,,protein_coding
1,1,10,30,+,exon,ref_gene_1,ref_tr_1,1.0,protein_coding
2,1,100,120,+,exon,ref_gene_1,ref_tr_1,2.0,protein_coding
3,1,200,220,+,exon,ref_gene_1,ref_tr_1,3.0,protein_coding
4,1,300,340,+,exon,ref_gene_1,ref_tr_1,4.0,protein_coding


In [3]:
{"Chromosome": [],
                "Start": [],
                "End": [],
                "Strand": [],
                "Feature": [],
                "gene_id": [],
                "transcript_id": []
               }

{'Chromosome': [],
 'Start': [],
 'End': [],
 'Strand': [],
 'Feature': [],
 'gene_id': [],
 'transcript_id': []}

In [193]:
# A minus strand transcript
test_ref_tr2 = {"Chromosome": [2]*4,
                "Start": [10,10,80,100],
                "End": [120,20,90,120],
                "Strand": ["-"]*4,
                "Feature": ["transcript"] + ["exon"]*3,
                "gene_id": ["ref_gene_2"]*4,
                "transcript_id": ["ref_tr_2"]*4,
                "exon_number": [None,3,2,1],
                "gene_type": ["protein_coding"] * 4
               }


test_ref = pr.concat([pr.from_dict(test_ref_tr1), pr.from_dict(test_ref_tr2)])
test_ref

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,340,+,transcript,ref_gene_1,ref_tr_1,,protein_coding
1,1,10,30,+,exon,ref_gene_1,ref_tr_1,1.0,protein_coding
2,1,100,120,+,exon,ref_gene_1,ref_tr_1,2.0,protein_coding
3,1,200,220,+,exon,ref_gene_1,ref_tr_1,3.0,protein_coding
4,1,300,340,+,exon,ref_gene_1,ref_tr_1,4.0,protein_coding
5,2,10,120,-,transcript,ref_gene_2,ref_tr_2,,protein_coding
6,2,10,20,-,exon,ref_gene_2,ref_tr_2,3.0,protein_coding
7,2,80,90,-,exon,ref_gene_2,ref_tr_2,2.0,protein_coding
8,2,100,120,-,exon,ref_gene_2,ref_tr_2,1.0,protein_coding


In [194]:
# Now make test novel transcripts to cover my test cases

# 1. Novel last exons in first intron of annotated transcript (e.g. STMN2)
# For these to pass, they should share an identical 3'end with a first exon of a known transcript

test_novel_fi = {"Chromosome": [1]*6,
                "Start": [10] + [10,50] + [10] + [10,50],
                "End": [70,30,70,70,35,70],
                "Strand": ["+"]*6,
                "Feature": ["transcript"] + ["exon"]*2 + ["transcript"] + ["exon"]*2,
                "gene_id": ["nov_gene_1"]*6,
                "transcript_id": ["nov_tx_fi_p"]*3 + ["nov_tx_fi_f"]*3,
                "exon_number": [None,1,2,None,1,2],
                 "gene_type": ["protein_coding"] * 6
               }

pr.from_dict(test_novel_fi)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,70,+,transcript,nov_gene_1,nov_tx_fi_p,,protein_coding
1,1,10,30,+,exon,nov_gene_1,nov_tx_fi_p,1.0,protein_coding
2,1,50,70,+,exon,nov_gene_1,nov_tx_fi_p,2.0,protein_coding
3,1,10,70,+,transcript,nov_gene_1,nov_tx_fi_f,,protein_coding
4,1,10,35,+,exon,nov_gene_1,nov_tx_fi_f,1.0,protein_coding
5,1,50,70,+,exon,nov_gene_1,nov_tx_fi_f,2.0,protein_coding


In [195]:
# 2. Internal intron, spliced in last exon (fully contained within last exon) (e.g. ONECUT1)
# For these to pass, they should match the intron chain of a known transcript up until the penultimate exon

test_novel_si = {"Chromosome": [1]*8,
                "Start": [10,10,100,140] + [50,50,100,140],
                "End": [160,30,120,160] + [160,70,120,160],
                "Strand": ["+"]*8,
                "Feature": ["transcript"] + ["exon"]*3 + ["transcript"] + ["exon"]*3,
                "gene_id": ["nov_gene_1"]*8,
                "transcript_id": ["nov_tx_si_p"]*4 + ["nov_tx_si_f"]*4,
                "exon_number": [None,1,2,3,None,1,2,3],
                 "gene_type": ["protein_coding"] * 8
               }

pr.from_dict(test_novel_si)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,160,+,transcript,nov_gene_1,nov_tx_si_p,,protein_coding
1,1,10,30,+,exon,nov_gene_1,nov_tx_si_p,1.0,protein_coding
2,1,100,120,+,exon,nov_gene_1,nov_tx_si_p,2.0,protein_coding
3,1,140,160,+,exon,nov_gene_1,nov_tx_si_p,3.0,protein_coding
4,1,50,160,+,transcript,nov_gene_1,nov_tx_si_f,,protein_coding
5,1,50,70,+,exon,nov_gene_1,nov_tx_si_f,1.0,protein_coding
6,1,100,120,+,exon,nov_gene_1,nov_tx_si_f,2.0,protein_coding
7,1,140,160,+,exon,nov_gene_1,nov_tx_si_f,3.0,protein_coding


In [196]:
# 3. Internal intron bleedthrough (e.g. SIN3B)
# For these events to pass, they should match the intron chain of a known transcript up until the penultimate exon

test_novel_bl = {"Chromosome": [1]*8,
                "Start": [10,10,100,200] + [10,50,100,200],
                "End": [240,30,120,240] + [240,70,120,240],
                "Strand": ["+"]*8,
                "Feature": ["transcript"] + ["exon"]*3 + ["transcript"] + ["exon"]*3,
                "gene_id": ["nov_gene_1"]*8,
                "transcript_id": ["nov_tx_bl_p"]*4 + ["nov_tx_bl_f"]*4,
                "exon_number": [None,1,2,3,None,1,2,3],
                 "gene_type": ["protein_coding"] * 8
               }

pr.from_dict(test_novel_bl)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,240,+,transcript,nov_gene_1,nov_tx_bl_p,,protein_coding
1,1,10,30,+,exon,nov_gene_1,nov_tx_bl_p,1.0,protein_coding
2,1,100,120,+,exon,nov_gene_1,nov_tx_bl_p,2.0,protein_coding
3,1,200,240,+,exon,nov_gene_1,nov_tx_bl_p,3.0,protein_coding
4,1,10,240,+,transcript,nov_gene_1,nov_tx_bl_f,,protein_coding
5,1,50,70,+,exon,nov_gene_1,nov_tx_bl_f,1.0,protein_coding
6,1,100,120,+,exon,nov_gene_1,nov_tx_bl_f,2.0,protein_coding
7,1,200,240,+,exon,nov_gene_1,nov_tx_bl_f,3.0,protein_coding


In [197]:
# 4. Internal intron with novel internal and terminal exon (e.g.)
# For these events to pass, they should match the intron chain of a known transcript,
# but have a, continuous chain of length n of novel events at the 3'end of the transcript
# (n can be varied)
# Event know from NP is fully contained within annotated intron - also set this constraint5

test_novel_mult = {"Chromosome": [1]*5, 
                   "Start": [10,10,100,130,150],
                   "End": [160,30,120,140,160],
                   "Strand": ["+"]*5,
                   "Feature": ["transcript"] + ["exon"]*4,
                   "gene_id": ["nov_gene_1"]*5,
                   "transcript_id": ["nov_tx_mult_p"]*5,
                   "exon_number": [None,1,2,3,4],
                   "gene_type": ["protein_coding"] * 5
                    }

pr.from_dict(test_novel_mult)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,160,+,transcript,nov_gene_1,nov_tx_mult_p,,protein_coding
1,1,10,30,+,exon,nov_gene_1,nov_tx_mult_p,1.0,protein_coding
2,1,100,120,+,exon,nov_gene_1,nov_tx_mult_p,2.0,protein_coding
3,1,130,140,+,exon,nov_gene_1,nov_tx_mult_p,3.0,protein_coding
4,1,150,160,+,exon,nov_gene_1,nov_tx_mult_p,4.0,protein_coding


In [198]:
# 5. 3'UTR intron fully contained within an annotated 3'UTR (e.g. TDP-43)
# For this to pass, they should match the intron chain of a known transcript up until the penultimate exon
# (Annotate as a 3'UTR intron (spliced out) after filtering for intron chain match)
test_novel_3ui = {"Chromosome": [1]*6,
                "Start": [10,10,100,200,300,330],
                "End": [340,30,120,220,310,340],
                "Strand": ["+"]*6,
                "Feature": ["transcript"] + ["exon"]*5,
                "gene_id": ["nov_gene_1"]*6,
                "transcript_id": ["nov_tx_3ui_p"]*6,
                  "exon_number": [None,1,2,3,4,5],
                  "gene_type": ["protein_coding"] * 6
                 }

pr.from_dict(test_novel_3ui)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,340,+,transcript,nov_gene_1,nov_tx_3ui_p,,protein_coding
1,1,10,30,+,exon,nov_gene_1,nov_tx_3ui_p,1.0,protein_coding
2,1,100,120,+,exon,nov_gene_1,nov_tx_3ui_p,2.0,protein_coding
3,1,200,220,+,exon,nov_gene_1,nov_tx_3ui_p,3.0,protein_coding
4,1,300,310,+,exon,nov_gene_1,nov_tx_3ui_p,4.0,protein_coding
5,1,330,340,+,exon,nov_gene_1,nov_tx_3ui_p,5.0,protein_coding


In [200]:
#6. Distal last exon spliced from penultimate exon (i.e. a mutually exclusive last exon) (e.g. SMC1A)
# For this to pass, they should match the intron chain of a known transcript up until the penultimate exon
# Will have to annotate more precisely later (i.e. differentiate from 7)

test_novel_exc_dist = {"Chromosome": [1]*5,
                "Start": [10,10,100,200,360],
                "End": [380,30,120,220,380],
                "Strand": ["+"]*5,
                "Feature": ["transcript"] + ["exon"]*4,
                "gene_id": ["nov_gene_1"]*5,
                "transcript_id": ["nov_tx_exc_dist_p"]*5,
                       "exon_number": [None,1,2,3,4],
                       "gene_type": ["protein_coding"] * 5
               }

pr.from_dict(test_novel_exc_dist)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,380,+,transcript,nov_gene_1,nov_tx_exc_dist_p,,protein_coding
1,1,10,30,+,exon,nov_gene_1,nov_tx_exc_dist_p,1.0,protein_coding
2,1,100,120,+,exon,nov_gene_1,nov_tx_exc_dist_p,2.0,protein_coding
3,1,200,220,+,exon,nov_gene_1,nov_tx_exc_dist_p,3.0,protein_coding
4,1,360,380,+,exon,nov_gene_1,nov_tx_exc_dist_p,4.0,protein_coding


In [201]:
#7. Distal last exon spliced from ann
# For this to pass, they should match the intron chain of a known transcript up until the penultimate exon
test_novel_le_dist = {"Chromosome": [1]*6,
                      "Start": [10,10,100,200,300,360],
                      "End": [380,30,120,220,310,380],
                      "Strand": ["+"]*6,
                      "Feature": ["transcript"] + ["exon"]*5,
                      "gene_id": ["nov_gene_1"]*6,
                      "transcript_id": ["nov_tx_le_dist_p"]*6,
                      "exon_number": [None,1,2,3,4,5],
                      "gene_type": ["protein_coding"] * 6
                     }

pr.from_dict(test_novel_le_dist)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,380,+,transcript,nov_gene_1,nov_tx_le_dist_p,,protein_coding
1,1,10,30,+,exon,nov_gene_1,nov_tx_le_dist_p,1.0,protein_coding
2,1,100,120,+,exon,nov_gene_1,nov_tx_le_dist_p,2.0,protein_coding
3,1,200,220,+,exon,nov_gene_1,nov_tx_le_dist_p,3.0,protein_coding
4,1,300,310,+,exon,nov_gene_1,nov_tx_le_dist_p,4.0,protein_coding
5,1,360,380,+,exon,nov_gene_1,nov_tx_le_dist_p,5.0,protein_coding


In [203]:
#8. Distal last exon spliced from annoatrd (minus strand)
# For this to pass, they should match the intron chain of a known transcript up until the penultimate exon
test_novel_le_dist_minus = {"Chromosome": [2]*4,
                      "Start": [40,40,80,100],
                      "End": [120,50,90,120],
                      "Strand": ["-"]*4,
                      "Feature": ["transcript"] + ["exon"]*3,
                      "gene_id": ["nov_gene_2"]*4,
                      "transcript_id": ["nov_tx_le_dist_p_minus"]*4,
                            "exon_number": [None,1,2,3],
                            "gene_type": ["protein_coding"] * 4
                     }

pr.from_dict(test_novel_le_dist_minus)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,2,40,120,-,transcript,nov_gene_2,nov_tx_le_dist_p_minus,,protein_coding
1,2,40,50,-,exon,nov_gene_2,nov_tx_le_dist_p_minus,1.0,protein_coding
2,2,80,90,-,exon,nov_gene_2,nov_tx_le_dist_p_minus,2.0,protein_coding
3,2,100,120,-,exon,nov_gene_2,nov_tx_le_dist_p_minus,3.0,protein_coding


In [204]:
#9 3'UTR extension (plus & minus strand)
# Expect a complete intron chain match, just that last exon 3'end is downstream of annotated 3'end
#A - Plus strand
test_novel_3utr_ext_plus = {"Chromosome": [1]*5,
                "Start": [10,10,100,200,300],
                "End": [350,30,120,220,350],
                "Strand": ["+"]*5,
                "Feature": ["transcript"] + ["exon"]*4,
                "gene_id": ["nov_gene_1"]*5,
                "transcript_id": ["novel_tx_3utr_ext_p_plus"] * 5,
                "exon_number": [None,1,2,3,4],
                           "gene_type": ["protein_coding"] * 5}

pr.from_dict(test_novel_3utr_ext_plus)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,350,+,transcript,nov_gene_1,novel_tx_3utr_ext_p_plus,,protein_coding
1,1,10,30,+,exon,nov_gene_1,novel_tx_3utr_ext_p_plus,1.0,protein_coding
2,1,100,120,+,exon,nov_gene_1,novel_tx_3utr_ext_p_plus,2.0,protein_coding
3,1,200,220,+,exon,nov_gene_1,novel_tx_3utr_ext_p_plus,3.0,protein_coding
4,1,300,350,+,exon,nov_gene_1,novel_tx_3utr_ext_p_plus,4.0,protein_coding


In [205]:
#B - Minus strand
test_novel_3utr_ext_minus = {"Chromosome": [2]*4,
                "Start": [5,5,80,100],
                "End": [120,20,90,120],
                "Strand": ["-"]*4,
                "Feature": ["transcript"] + ["exon"]*3,
                "gene_id": ["nov_gene_2"]*4,
                "transcript_id": ["novel_tx_3utr_ext_minus"]*4,
                "exon_number": [None,3,2,1],
                             "gene_type": ["protein_coding"] * 4
               }

pr.from_dict(test_novel_3utr_ext_minus)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,2,5,120,-,transcript,nov_gene_2,novel_tx_3utr_ext_minus,,protein_coding
1,2,5,20,-,exon,nov_gene_2,novel_tx_3utr_ext_minus,3.0,protein_coding
2,2,80,90,-,exon,nov_gene_2,novel_tx_3utr_ext_minus,2.0,protein_coding
3,2,100,120,-,exon,nov_gene_2,novel_tx_3utr_ext_minus,1.0,protein_coding


In [206]:
# 10. Bleedthrough but on minus strand
# Made one b4 but as doing filtering of complete match isoforms
# want to also cover the minus strand scenario (where look for start smaller than ref start )
test_novel_bl_minus = {"Chromosome": [2]*3,
                "Start": [70,70,100],
                "End": [120,90,120],
                "Strand": ["-"]*3,
                "Feature": ["transcript"] + ["exon"]*2,
                "gene_id": ["nov_gene_2"]*3,
                "transcript_id": ["novel_bl_p_minus"]*3,
                "exon_number": [None,2,1],
                       "gene_type": ["protein_coding"] * 3
               }

pr.from_dict(test_novel_bl_minus)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,2,70,120,-,transcript,nov_gene_2,novel_bl_p_minus,,protein_coding
1,2,70,90,-,exon,nov_gene_2,novel_bl_p_minus,2.0,protein_coding
2,2,100,120,-,exon,nov_gene_2,novel_bl_p_minus,1.0,protein_coding


In [207]:
# 11. first intron spliced in event on minus strand

test_novel_fi_minus = {"Chromosome": [2]*3,
                "Start": [93,93,100],
                "End": [120,96,120],
                "Strand": ["-"]*3,
                "Feature": ["transcript"] + ["exon"]*2,
                "gene_id": ["nov_gene_2"]*3,
                "transcript_id": ["novel_fi_p_minus"]*3,
                "exon_number": [None,2,1],
                       "gene_type": ["protein_coding"] * 3
               }

pr.from_dict(test_novel_fi_minus)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,2,93,120,-,transcript,nov_gene_2,novel_fi_p_minus,,protein_coding
1,2,93,96,-,exon,nov_gene_2,novel_fi_p_minus,2.0,protein_coding
2,2,100,120,-,exon,nov_gene_2,novel_fi_p_minus,1.0,protein_coding


In [208]:
# 12. 'any' complete match, but matching reference intron for novel's first intron is not a first intron 
test_novel_any_not_fi_plus = {"Chromosome": [1]*4,
                              "Start": [100,100,200,260],
                              "End": [280,120,220,280],
                              "Strand": ["+"]*4,
                              "Feature": ["transcript"] + ["exon"]*3,
                              "gene_id": ["nov_gene_1"]*4,
                              "transcript_id": ["novel_tx_any_not_first_intron_plus"] * 4,
                              "exon_number": [None,1,2,3],
                              "gene_type": ["protein_coding"] * 4
                             }


pr.from_dict(test_novel_any_not_fi_plus)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,100,280,+,transcript,nov_gene_1,novel_tx_any_not_first_intron_plus,,protein_coding
1,1,100,120,+,exon,nov_gene_1,novel_tx_any_not_first_intron_plus,1.0,protein_coding
2,1,200,220,+,exon,nov_gene_1,novel_tx_any_not_first_intron_plus,2.0,protein_coding
3,1,260,280,+,exon,nov_gene_1,novel_tx_any_not_first_intron_plus,3.0,protein_coding


In [209]:
# 12B - minus
test_novel_any_not_fi_minus =  {"Chromosome": [2]*4,
                "Start": [4,4,10,80],
                "End": [90,6,20,90],
                "Strand": ["-"]*4,
                "Feature": ["transcript"] + ["exon"]*3,
                "gene_id": ["nov_gene_2"]*4,
                "transcript_id": ["novel_tx_any_not_first_intron_minus"]*4,
                "exon_number": [None,3,2,1],
                                "gene_type": ["protein_coding"] * 4
               }

pr.from_dict(test_novel_any_not_fi_minus)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,2,4,90,-,transcript,nov_gene_2,novel_tx_any_not_first_intron_minus,,protein_coding
1,2,4,6,-,exon,nov_gene_2,novel_tx_any_not_first_intron_minus,3.0,protein_coding
2,2,10,20,-,exon,nov_gene_2,novel_tx_any_not_first_intron_minus,2.0,protein_coding
3,2,80,90,-,exon,nov_gene_2,novel_tx_any_not_first_intron_minus,1.0,protein_coding


In [212]:
test_reass_ref_tr = {"Chromosome": [1]*5,
                "Start": [10,10,100,200,300],
                "End": [340,30,120,220,340],
                "Strand": ["+"]*5,
                "Feature": ["transcript"] +["exon"]*4,
                "gene_id": ["novel_gene_1"]*5,
                "transcript_id": ["reass_ref_tr_1"] * 5,
                "exon_number": [None,1,2,3,4],
                "gene_type": ["protein_coding"] * 5}

pr.from_dict(test_reass_ref_tr)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id,exon_number,gene_type
0,1,10,340,+,transcript,novel_gene_1,reass_ref_tr_1,,protein_coding
1,1,10,30,+,exon,novel_gene_1,reass_ref_tr_1,1.0,protein_coding
2,1,100,120,+,exon,novel_gene_1,reass_ref_tr_1,2.0,protein_coding
3,1,200,220,+,exon,novel_gene_1,reass_ref_tr_1,3.0,protein_coding
4,1,300,340,+,exon,novel_gene_1,reass_ref_tr_1,4.0,protein_coding


In [213]:
test_novel_gr = pr.concat([pr.from_dict(event) for event in [test_novel_3ui,
                                                            test_novel_bl,
                                                            test_novel_exc_dist,
                                                            test_novel_fi,
                                                            test_novel_le_dist,
                                                            test_novel_mult,
                                                            test_novel_si,
                                                            test_novel_le_dist_minus,
                                                             test_novel_3utr_ext_plus,
                                                             test_novel_3utr_ext_minus,
                                                             test_novel_bl_minus,
                                                             test_novel_fi_minus,
                                                             test_novel_any_not_fi_plus,
                                                             test_novel_any_not_fi_minus,
                                                             test_reass_ref_tr
                                                            ]
                          ]
                         )

test_novel_gr.Feature

0     transcript
1           exon
2           exon
3           exon
4           exon
         ...    
13          exon
14    transcript
15          exon
16          exon
17          exon
Name: Feature, Length: 76, dtype: object

In [214]:
test_novel_gr.to_gtf("../tests/novel.test_intron_chain_matching.transcripts.gtf")
test_ref.to_gtf("../tests/reference.test_intron_chain_matching.transcripts.gtf")

In [15]:
def introns_from_df(df):
    '''
    '''
    
    n_exons = len(df)
    
    if n_exons < 2:
        return None
        #print(df)
        #raise Exception("at least two exons are required for transcript to have an intron")
    # n exons = n-1 introns
    
    strand = df["Strand"].drop_duplicates().tolist()[0]
#     print(strand)
    chrom = df["Chromosome"].drop_duplicates().tolist()[0]
    gene_id = df["gene_id"].drop_duplicates().tolist()[0]
    tx_id = df["transcript_id"].drop_duplicates().tolist()[0]
    feature = "intron"
    introns = {}
    for i in range(0, n_exons - 1):
        if strand == "+":
            intron_start = df.iloc[i, lambda x: x.columns.get_loc("End")]
            intron_end = df.iloc[i+1, lambda x: x.columns.get_loc("Start")]
            introns[str(i)] = {"Chromosome": chrom,
                               "Start": intron_start,
                               "End": intron_end,
                               "Strand": strand,
                               "Feature": feature,
                               "gene_id": gene_id,
                               "transcript_id": tx_id}
        elif strand == "-":
            intron_start = df.iloc[i, lambda x: x.columns.get_loc("End")]
            intron_end = df.iloc[i+1, lambda x: x.columns.get_loc("Start")]
            introns[str(i)] = {"Chromosome": chrom,
                               "Start": intron_start,
                               "End": intron_end,
                               "Strand": strand,
                               "Feature": feature,
                               "gene_id": gene_id,
                               "transcript_id": tx_id}
    return pd.DataFrame.from_dict(introns, orient = "index")
        


def introns_by_tx(gr, by="transcript_id", nb_cpu=1):
    '''
    '''
    # Sort by position (for safety)
    gr = gr.sort()
    
    return gr.apply(lambda df: df.groupby(by).apply(introns_from_df), nb_cpu=nb_cpu)
    
    
test_novel_introns = introns_by_tx(test_novel_gr)
test_ref_introns = introns_by_tx(test_ref)
    
    
test_novel_introns

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,30,100,+,intron,nov_gene_1,nov_tx_3ui_p
1,1,120,200,+,intron,nov_gene_1,nov_tx_3ui_p
2,1,220,300,+,intron,nov_gene_1,nov_tx_3ui_p
3,1,310,330,+,intron,nov_gene_1,nov_tx_3ui_p
4,1,70,100,+,intron,nov_gene_1,nov_tx_bl_f
5,1,120,200,+,intron,nov_gene_1,nov_tx_bl_f
6,1,30,100,+,intron,nov_gene_1,nov_tx_bl_p
7,1,120,200,+,intron,nov_gene_1,nov_tx_bl_p
8,1,30,100,+,intron,nov_gene_1,nov_tx_exc_dist_p
9,1,120,200,+,intron,nov_gene_1,nov_tx_exc_dist_p


In [16]:
test_ref_introns

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,30,100,+,intron,ref_gene_1,ref_tr_1
1,1,120,200,+,intron,ref_gene_1,ref_tr_1
2,1,220,300,+,intron,ref_gene_1,ref_tr_1
3,2,20,80,-,intron,ref_gene_2,ref_tr_2
4,2,90,100,-,intron,ref_gene_2,ref_tr_2


In [17]:
def rle(inarray):
        """
        run length encoding. Partial credit to R rle function. 
        Multi datatype arrays catered for including non Numpy
        returns: tuple (runlengths, startpositions, values)
        https://stackoverflow.com/questions/1066758/find-length-of-sequences-of-identical-values-in-a-numpy-array-run-length-encodi
        Thomas Browne
        """
        ia = np.asarray(inarray)                # force numpy
        n = len(ia)
        if n == 0: 
            return (None, None, None)
        else:
            y = ia[1:] != ia[:-1]               # pairwise unequal (string safe)
            i = np.append(np.where(y), n - 1)   # must include last element posi
            z = np.diff(np.append(-1, i))       # run lengths
            p = np.cumsum(np.append(0, z))[:-1] # positions
            return(z, p, ia[i])

In [None]:
# Dict of {novel_tx_id: {matches: [ref_id], chain_match: [n_matching], terminal_non_match: [n_not_matching]}}
novel_info_dict = {}

test_novel_gr.apply(lambda df: df.groupby("transcript_id").filter(lambda grp: len(grp) > 2))

for key, dfs in pr.itergrs([test_novel_introns.apply(lambda df: df.groupby("transcript_id").filter(lambda grp: len(grp) > 1)), test_ref_introns], strand=True, keys=True):
    print("----processing chrom and strand pair {0} & {1}".format(key[0], key[1]))
  
    by_tx = tuple([df.groupby("transcript_id") for df in dfs])
    novel_txipts = by_tx[0]
    ref_txipts = by_tx[1]
    
    strand = key[1]
#     print(strand)
#    #1. for each novel transcript, test it against all ref transcripts:
    
    
    for novel_id, novel_introns in novel_txipts:
#         print(novel_id)
#         print(novel_introns)
#         print("class of novel_introns is {}".format(type(novel_introns)))
        #print(exons)
        
        if strand == "-":
            #First in order = last intron - reverse df now
            novel_introns = novel_introns[::-1].reset_index(drop=True)
#             ref_introns = ref_introns[::-1].reset_index(drop=True)
            #print(exons)
        #by_tx[1]
        else:
            pass
        
        
        if strand == "+":
            first_matches = [np.array_equal(novel_introns.head(1)[["Start","End"]],
                                            ref_introns.head(1)[["Start","End"]]) for ref_id,ref_introns in ref_txipts]
        else:
            first_matches = [np.array_equal(novel_introns.head(1)[["Start","End"]],
                                            ref_introns[::-1].reset_index(drop=True) # rev order
                                            .head(1)[["Start","End"]]
                                           )
                             for ref_id,ref_introns in ref_txipts]

        #print(novel_id)
#         print(first_matches)
        if not sum(first_matches) > 0:
            print("{0} does not match any reference transcripts in its first intron. Skipping".format(novel_id))
            continue
        
        # Now compare novel transcript against each ref transcript with match in first intron.
        # 
        for ref_tr, first_match in zip(ref_txipts, first_matches):
            
            if not first_match:
                continue
            #intron_chain_match = np.equal(np.asarray(exons[["Start","End"]]), np.asarray(ref_tr[1][["Start","End"]]))
            ref_introns = ref_tr[1]
            
            if strand == "-":
                ref_introns = ref_introns[::-1].reset_index(drop=True) # rev order
            
            n_novel_introns = len(novel_introns)
            n_ref_introns = len(ref_introns)
            
            # To avoid a slicing error for txipts shorter than novel
            max_chain = min(n_novel_introns, n_ref_introns)
#             print(max_chain)
#             print(ref_tr[1].iloc[0,])
#             print(n_novel_introns)
#             print(len(ref_tr[1]))
            novel_chain_match = pd.DataFrame([(novel_introns.iloc[i,:][["Start","End"]]
                                               .eq(ref_introns.iloc[i,:][["Start","End"]]
                                                  )
                                              )
                                              for i in range(max_chain)])
    
            # Collapse to single True/False per row - does intron match?
            novel_chain_match = novel_chain_match.apply(np.all, axis=1, raw=True)
            
#             print(novel_chain_match)
            
            runs, starts, vals = rle(novel_chain_match)
            #print(runs[0])
            #print(starts)
            #print("\n {0}".format(np.where(vals == False)))
            
            if np.all(vals) or np.array_equal(vals, [True,False]):
                # Possible genuine match, should update dict with info
                
                # match (from start of ref Txipt)
                
                if vals.size == 1:
                    #i.e. all true/introns match (e.g. bleedthrough/ where )
                    terminal_non_match = 0
                else:
                    terminal_non_match = runs[1]
                
#                 print(terminal_non_match)
                
                if novel_id not in novel_info_dict:
                    
                    novel_info_dict[novel_id] = {"matches": [ref_tr[0]],
                                                 "chain_match": [runs[0]], #Always starts with true, so take length of true
                                                 "terminal_non_match": [terminal_non_match],
                                                }
                
                else:
                    # Append to dict
                    novel_info_dict[novel_id]["matches"].append(ref_tr[0])
                    novel_info_dict[novel_id]["chain_match"].append(ref_tr[0])
                    novel_info_dict[novel_id]["terminal_non_match"].append(ref_tr[0])
                    
                
            else:
                continue
            
            
            
print(novel_info_dict)
#             intron_chain_match = exons[["Start","End"]].eq(ref_tr[1][["Start","End"]])
#             print("\n")
#             print(intron_chain_match)
#             print(intron_chain_match.apply(np.all, axis = 1))
            
            
                #loc[np.where(lambda x: (x[["Start", "End"]].all()))])
                                                  #apply(lambda x: all(x), axis = "index", raw = True))
#             print(intron_chain_match)
            
            
#         first_intron = exons.head(1)[["Start","End"]] if strand == "+" else exons.tail(1)[["Start","End"]]
        
#         grp_filter_matches = by_tx[1].filter(lambda x: np.array_equal(first_intron, x.head(1)[["Start", "End"]]))
#         print(grp_filter_matches)
#         print(type(grp_filter_matches))
     #for novel, ref_tr in zip(by_tx[0],by_tx[1]:
#         print(nov_tr[1])
#         print(ref_tr)
             
pd.concat({key: pd.DataFrame.from_dict(d, orient = "columns") for key, d in novel_info_dict.items()}, axis=0)

In [None]:
def match_intron_chains(novel_gr, ref_gr, id_col = "transcript_id", nb_cpu = 1):
    '''
    grs should contain introns
    '''
    # {novel_id: {matches: [ref_id], chain_match: [n], terminal_non_match: [n]}}
    match_info_dict = {}
    

    
    for key, dfs in pr.itergrs([novel_gr, ref_gr], strand=True, keys=True):
        print("----processing chrom and strand pair {0} & {1}".format(key[0], key[1]))
        # dfs = novel_gr & ref_gr matched by key (chromosome & strand)
        # pandas group by objects of (transcript_id, df)
        by_tx = tuple([df.groupby("transcript_id") for df in dfs])
        
        novel_txipts = by_tx[0]
        ref_txipts = by_tx[1]
        
        # Pyranges keys are tuples of (chr,strand)
        strand = key[1]
        
        
        #Comparing each novel transcript against all ref transcripts
        for novel_id, novel_introns in novel_txipts:
        
            if strand == "-":
                # Standard PyRanges sort - First in df = last intron (smallest values i.e. leftmost)
                # Reverse so first row in df is always the first intron
                # is reset_index necessary?
                novel_introns = novel_introns[::-1].reset_index(drop=True)
            
            else:
                pass
        
            # As a first pass, check for matches in intron chain of first introns between novel and reference Txs
            if strand == "+":
                first_matches = [np.array_equal(novel_introns.head(1)[["Start","End"]],
                                                ref_introns.head(1)[["Start","End"]]) 
                                 for ref_id,ref_introns in ref_txipts]
            else:
                first_matches = [np.array_equal(novel_introns.head(1)[["Start","End"]],
                                                ref_introns[::-1].reset_index(drop=True) # rev order so 1st row = 1st intron
                                                .head(1)[["Start","End"]]
                                               )
                                 for ref_id,ref_introns in ref_txipts]


            if not sum(first_matches) > 0:
                print("{0} does not match any reference transcripts in its first intron. Skipping".format(novel_id))
                continue
        
            # Compare full introns chains of novel transcript against each ref transcript with match in first intron.
        
            for ref_tr, first_match in zip(ref_txipts, first_matches):
            
                if not first_match:
                    continue
                
            
                ref_id = ref_tr[0]
#                 print("ref_id {}".format(ref_id))
                ref_introns = ref_tr[1]
            
                if strand == "-":
                    ref_introns = ref_introns[::-1].reset_index(drop=True) # reverse so first row = first intron
                        
                # To avoid a slicing error for ref txipts shorter than novel
                n_novel_introns = len(novel_introns)
                n_ref_introns = len(ref_introns)
                max_chain = min(n_novel_introns, n_ref_introns)
#             print(max_chain)
#             print(ref_tr[1].iloc[0,])
#             print(n_novel_introns)
#             print(len(ref_tr[1]))

                # Row-wise, check whether match with corresponding intron of reference transcript
                novel_chain_match = pd.DataFrame([(novel_introns.iloc[i,:][["Start","End"]]
                                                   .eq(ref_introns.iloc[i,:][["Start","End"]]
                                                      )
                                                  )
                                                  for i in range(max_chain)])
    
                # Collapse to single True/False per row - does intron completely match?
                novel_chain_match = novel_chain_match.apply(np.all, axis=1, raw=True)
            
#             print(novel_chain_match)
            
                runs, starts, vals = rle(novel_chain_match)
            #print(runs[0])
            #print(starts)
            #print("\n {0}".format(np.where(vals == False)))
            
                # Considered a valid match if a intron chain completely identical 
                # or matches at beginning of txipt but differs at the end
                if np.all(vals) or np.array_equal(vals, [True,False]):
                
                    # Don't want to throw away all valid matches (yet)
                    # Possible genuine match, should update dict with info
                    # match (from start of ref Txipt)
                
                    if vals.size == 1:
                        #i.e. all true/introns match (e.g. bleedthrough)
                        terminal_non_match = 0
                    else:
                        # All valid = consective match & non-match, so non-match = 2nd in array 
                        terminal_non_match = runs[1]
                
#                 print(terminal_non_match)
                
                    if novel_id not in match_info_dict:
                    
                        match_info_dict[novel_id] = {"matches": [ref_id],
                                                     "chain_match": [runs[0]], #Always starts with true, so take length of true
                                                     "terminal_non_match": [terminal_non_match],
                                                    }
                
                    else:
                        # Append to dict
                        match_info_dict[novel_id]["matches"].append(ref_id)
                        match_info_dict[novel_id]["chain_match"].append(runs[0])
                        match_info_dict[novel_id]["terminal_non_match"].append(terminal_non_match)
                    
                
                else:
                    continue
    
    # Output df for easier parsing
#     match_df = pd.concat({key: pd.DataFrame.from_dict(d, orient = "columns") for key, d in match_info_dict.items()}, axis=0)
    
    return match_info_dict
        


In [None]:
match_intron_chains(test_novel_introns, test_ref_introns)

In [None]:
np.array_equal(np.array([True, False]), [True, False])
np.all([True,True])

In [None]:
# test_introns = pr.from_dict(test_novel_3ui)#.subset(lambda df: df["transcript_id"] == "nov_tx_3ui_p")

# n_exons = len(test_introns)
# # n exons = n -1 introns

# # plus strand txipt - End of first exon = start of intron, Start of 3'exon = end coord

# test_introns.as_df().iloc[0,lambda x: x.columns.get_loc("End")]

# def introns_from_df(df):
#     '''
#     '''
    
#     n_exons = len(df)
    
#     if n_exons < 2:
#         raise Exception("at least two exons are required for transcript to have an intron")
#     # n exons = n-1 introns
    
#     strand = df["Strand"].drop_duplicates().tolist()[0]
# #     print(strand)
#     chrom = df["Chromosome"].drop_duplicates().tolist()[0]
#     gene_id = df["gene_id"].drop_duplicates().tolist()[0]
#     tx_id = df["transcript_id"].drop_duplicates().tolist()[0]
#     feature = "intron"
#     introns = {}
#     for i in range(0, n_exons - 1):
#         if strand == "+":
#             intron_start = df.iloc[i, lambda x: x.columns.get_loc("End")]
#             intron_end = df.iloc[i+1, lambda x: x.columns.get_loc("Start")]
#             introns[str(i)] = {"Chromosome": chrom,
#                                "Start": intron_start,
#                                "End": intron_end,
#                                "Strand": strand,
#                                "Feature": feature,
#                                "gene_id": gene_id,
#                                "transcript_id": tx_id}
#         elif strand == "-":
#             intron_end = df.iloc[i, lambda x: x.columns.get_loc("Start")]
#             intron_start = df.iloc[i+1, lambda x: x.columns.get_loc("End")]
#             introns[str(i)] = {"Chromosome": chrom,
#                                "Start": intron_start,
#                                "End": intron_end,
#                                "Strand": strand,
#                                "Feature": feature,
#                                "gene_id": gene_id,
#                                "transcript_id": tx_id}
#     return pd.DataFrame.from_dict(introns, orient = "index")
        


# def introns_by_tx(gr, by="transcript_id", nb_cpu=1):
#     '''
#     '''
    
#     return gr.apply(lambda df: df.groupby(by).apply(introns_from_df), nb_cpu=1)
    
    
# test_introns

In [None]:
introns_by_tx(test_introns)

In [None]:
introns_by_tx(pr.data.ensembl_gtf().subset(lambda df: (df.gene_id == "ENSG00000205231") & (df.Feature == "exon")))

In [None]:

pr.data.ensembl_gtf().subset(lambda df: df.transcript_id == "ENST00000450305")[["transcript_id","Feature"]].features.introns(by="transcript").dtypes

In [None]:
# test_ref.features.introns(by="transcript")

(pr.data.ensembl_gtf().subset(lambda df: df.transcript_id == "ENST00000450305")
#  [["gene_id","transcript_id"]]
 .features.introns(by="transcript"))


In [None]:
pr.data.ensembl_gtf().subset(lambda df: (df.gene_id == "ENSG00000205231") & (df.Feature == "exon"))

In [None]:
pr.data.ensembl_gtf().subset(lambda df: (df.gene_id == "ENSG00000205231")).features.introns(by="transcript")
#Start of first feature, end of next along (if properly sorted)

In [75]:
path_stie_gtf_chr1_nov = "../two_sample_example_output/stringtie/chr1.no_ref_id.TDP43-F_S6.assembled.gtf"
path_ref_gtf = "../data/annotation/gencode.v34.annotation.gtf"

In [76]:
stie_chr1 = pr.read_gtf(path_stie_gtf_chr1_nov)
stie_chr1

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,cov,FPKM,TPM,exon_number
0,chr1,StringTie,transcript,827750,859332,1000,+,.,PAPA.35,PAPA.35.14,98.466469,11.679049,30.310225,
1,chr1,StringTie,exon,827750,827775,1000,+,.,PAPA.35,PAPA.35.14,96.734062,,,1
2,chr1,StringTie,exon,829002,829104,1000,+,.,PAPA.35,PAPA.35.14,110.622597,,,2
3,chr1,StringTie,exon,847653,847806,1000,+,.,PAPA.35,PAPA.35.14,118.018204,,,3
4,chr1,StringTie,exon,851926,852110,1000,+,.,PAPA.35,PAPA.35.14,115.267273,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26008,chr1,StringTie,exon,248855557,248855635,1000,-,.,PAPA.2531,PAPA.2531.4,5.161189,,,5
26009,chr1,StringTie,exon,248855724,248855943,1000,-,.,PAPA.2531,PAPA.2531.4,4.278336,,,6
26010,chr1,StringTie,exon,248856287,248856422,1000,-,.,PAPA.2531,PAPA.2531.4,5.487612,,,7
26011,chr1,StringTie,exon,248856513,248856562,1000,-,.,PAPA.2531,PAPA.2531.4,5.115693,,,8


In [19]:
ref = pr.read_gtf(path_ref_gtf)
ref

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,gene,11868,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,,,,,,,,,,
1,chr1,HAVANA,transcript,11868,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,,,,,
2,chr1,HAVANA,exon,11868,12227,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,1,ENSE00002234944.1,,,
3,chr1,HAVANA,exon,12612,12721,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,2,ENSE00003582793.1,,,
4,chr1,HAVANA,exon,13220,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,3,ENSE00002312635.1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912491,chrY,HAVANA,exon,57214349,57214397,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,1,ENSE00002072208.1,PGO:0000005,,
2912492,chrY,HAVANA,exon,57213879,57213964,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,2,ENSE00002046926.1,PGO:0000005,,
2912493,chrY,HAVANA,exon,57213525,57213602,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,3,ENSE00002021169.1,PGO:0000005,,
2912494,chrY,HAVANA,exon,57213203,57213357,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,4,ENSE00002036959.1,PGO:0000005,,


In [20]:
stie_chr1_exons = stie_chr1.subset(lambda df: df["Feature"] == "exon")
stie_chr1_exons

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,cov,FPKM,TPM,exon_number
0,chr1,StringTie,exon,827750,827775,1000,+,.,PAPA.35,PAPA.35.14,96.734062,,,1
1,chr1,StringTie,exon,829002,829104,1000,+,.,PAPA.35,PAPA.35.14,110.622597,,,2
2,chr1,StringTie,exon,847653,847806,1000,+,.,PAPA.35,PAPA.35.14,118.018204,,,3
3,chr1,StringTie,exon,851926,852110,1000,+,.,PAPA.35,PAPA.35.14,115.267273,,,4
4,chr1,StringTie,exon,852670,852766,1000,+,.,PAPA.35,PAPA.35.14,135.545364,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23563,chr1,StringTie,exon,248855557,248855635,1000,-,.,PAPA.2531,PAPA.2531.4,5.161189,,,5
23564,chr1,StringTie,exon,248855724,248855943,1000,-,.,PAPA.2531,PAPA.2531.4,4.278336,,,6
23565,chr1,StringTie,exon,248856287,248856422,1000,-,.,PAPA.2531,PAPA.2531.4,5.487612,,,7
23566,chr1,StringTie,exon,248856513,248856562,1000,-,.,PAPA.2531,PAPA.2531.4,5.115693,,,8


In [21]:
ref_exons = ref.subset(lambda df: df["Feature"] == "exon")
ref_chr1_exons = ref_exons.subset(lambda df: df["Chromosome"] == "chr1")
ref_exons

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,exon,11868,12227,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,1,ENSE00002234944.1,,,
1,chr1,HAVANA,exon,12612,12721,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,2,ENSE00003582793.1,,,
2,chr1,HAVANA,exon,13220,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,3,ENSE00002312635.1,,,
3,chr1,HAVANA,exon,12009,12057,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,transcribed_unprocessed_pseudogene,DDX11L1-201,,basic,OTTHUMT00000002844.1,1,ENSE00001948541.1,PGO:0000019,,
4,chr1,HAVANA,exon,12178,12227,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,transcribed_unprocessed_pseudogene,DDX11L1-201,,basic,OTTHUMT00000002844.1,2,ENSE00001671638.2,PGO:0000019,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1378883,chrY,HAVANA,exon,57214349,57214397,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,1,ENSE00002072208.1,PGO:0000005,,
1378884,chrY,HAVANA,exon,57213879,57213964,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,2,ENSE00002046926.1,PGO:0000005,,
1378885,chrY,HAVANA,exon,57213525,57213602,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,3,ENSE00002021169.1,PGO:0000005,,
1378886,chrY,HAVANA,exon,57213203,57213357,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,4,ENSE00002036959.1,PGO:0000005,,


In [22]:
#prot coding / lncRNA
# print(ref_chr1_exons.gene_type.value_counts())
ref_chr1_pc_exons = ref_chr1_exons.subset(lambda df: df.gene_type.isin(["protein_coding", "lncRNA"]))
ref_chr1_pc_exons

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,exon,29553,30039,.,+,.,ENSG00000243485.5,lncRNA,...,lncRNA,MIR1302-2HG-202,5,basic,OTTHUMT00000002840.1,1,ENSE00001947070.1,,,
1,chr1,HAVANA,exon,30563,30667,.,+,.,ENSG00000243485.5,lncRNA,...,lncRNA,MIR1302-2HG-202,5,basic,OTTHUMT00000002840.1,2,ENSE00001922571.1,,,
2,chr1,HAVANA,exon,30975,31097,.,+,.,ENSG00000243485.5,lncRNA,...,lncRNA,MIR1302-2HG-202,5,basic,OTTHUMT00000002840.1,3,ENSE00001827679.1,,,
3,chr1,HAVANA,exon,30266,30667,.,+,.,ENSG00000243485.5,lncRNA,...,lncRNA,MIR1302-2HG-201,5,basic,OTTHUMT00000002841.1,1,ENSE00001841699.1,,,
4,chr1,HAVANA,exon,30975,31109,.,+,.,ENSG00000243485.5,lncRNA,...,lncRNA,MIR1302-2HG-201,5,basic,OTTHUMT00000002841.1,2,ENSE00001890064.1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121907,chr1,HAVANA,exon,248857668,248858017,.,-,.,ENSG00000171163.16,protein_coding,...,retained_intron,ZNF692-218,3,,OTTHUMT00000097304.1,3,ENSE00001904444.1,,,
121908,chr1,HAVANA,exon,248859014,248859144,.,-,.,ENSG00000171163.16,protein_coding,...,retained_intron,ZNF692-229,4,,OTTHUMT00000382605.1,1,ENSE00002142255.1,,,
121909,chr1,HAVANA,exon,248857864,248858321,.,-,.,ENSG00000171163.16,protein_coding,...,retained_intron,ZNF692-229,4,,OTTHUMT00000382605.1,2,ENSE00002187693.1,,,
121910,chr1,HAVANA,exon,248858917,248859033,.,-,.,ENSG00000171163.16,protein_coding,...,retained_intron,ZNF692-228,4,,OTTHUMT00000382606.1,1,ENSE00002188413.1,,,


In [None]:
# stie_chr1_introns = introns_by_tx(stie_chr1_exons,nb_cpu=1)
# stie_chr1_introns

In [None]:
# ref_all_introns = introns_by_tx(ref_exons, nb_cpu=4)
# ref_all_introns
# ref_chr1_introns = introns_by_tx(ref_chr1_pc_exons, nb_cpu=2)
# ref_chr1_introns

In [None]:
# chr1_chain_matching = match_intron_chains(stie_chr1_introns, ref_chr1_introns)
# chr1_chain_matching

In [None]:
# ref_chr1_introns.subset(lambda df: df.transcript_id == "ENST00000674495.1")

In [None]:
# ref_chr1_exons.subset(lambda df: df.transcript_id == "ENST00000674495.1").sort()

### Alternative (hopefully more scalable approach)

Try and use pyranges internals as much as possible, and avoid manual looping and comparisons

Essentially, Find overlapping introns, then filter for those that are identical

In [71]:
def intron_id(gr):

    return gr.assign("intron_id",
                                         lambda df: pd.Series([":".join([tx_id, str(start), str(end)])
                                                     for tx_id, start, end in zip(df["transcript_id"],
                                                                                  df["Start"],
                                                                                  df["End"])
                                                    ])
                                        )

def sort_introns_by_strand(df):
    '''
    '''
    # first reset_index call removes the original index of the group (e.g. row 4005 in df)
    # second reset_index call adds the sorted index as a column to the dataframe (the order along exon in each transcript)
    if (df.Strand == '+').all():
        return df.sort_values(by=['End']).reset_index(drop=True).reset_index()
    elif (df.Strand == '-').all():
        return df.sort_values(by=['Start'], ascending=False).reset_index(drop=True).reset_index()


    
def validate_matching_chain(df, max_terminal_non_match=2):
    '''
    apply to grouped df
    '''
    
    runs, starts, vals = rle(df["match"])
    
    # Valid matches are:
    # All introns match (e.g. bleedthrough event)
    # All but last x introns match (usually one) (i.e. runs = 1,0)
    
    if np.all(vals):
        # All introns match (e.g. bleedthrough event)
        return True
    
    elif np.array_equal(vals, [1,0]) and runs[-1] <= max_terminal_non_match:
        # all but last x introns match (x = max_terminal_non_match) (i.e. runs = 1,0)
        return True
    
    else:
        return False

    

def stie_groupby_last_exon(df, exon_n_col, which="last"):
    '''
    '''
    
    if (df["Strand"] == "+").all():
        if which == "last":
            return df[exon_n_col].idxmax()
        elif which == "first":
            return df[exon_n_col].idxmin()
    
    elif (df["Strand"] == "-").all():
        if which == "last":
            return df[exon_n_col].idxmin()
        elif which == "first":
            return df[exon_n_col].idxmax()

        
def filter_multi_exon(df, exon_n_col):
    '''
    Want transcripts with > 1 exon
    '''
    if df[exon_n_col].nunique() > 1:
        return True
    else:
        return False

    
def get_terminal_exons(gr,
                   feature_col = "Feature",
                   id_col = "transcript_id",
                   exon_number_col = "exon_number",
                   source = None,
                   which_exon="last",
                   filter_single = False,
                   nb_cpu = 1):
    '''
    Return gr of last exons for each transcript_id
    In process, exon_number_col will be converted to type 'int'
    StringTie merged GTFs (or whatever tool single_steps/stringtie_longreads.smk is using) 
    reports exon_number that DOES NOT RESPECT STRAND (from browsing in IGV)
    i.e. for minus-strand - largest exon_number for transcript corresponds to FIRST EXON, not last
    Annotated (i.e. Ensembl) reported exon_numbers DO RESPECT STRAND (i.e. max always = last exon)
    
    if Do respect strand, put source = None (default)
    if Don't respect strand, put source = "stringtie" (i.e. plus strand = max, minus strand = min)
    '''
    # Pull out exons, convert exon_number to int
    mod_gr = (gr.assign(exon_number_col,
                      lambda df: df[exon_number_col].astype(int),
                      nb_cpu = nb_cpu)
             )
    
    
    # Filter out single-exon transcripts
    if filter_single:
        print("Filtering for multi-exon transcripts...")
        print("Before: {}".format(len(set(mod_gr.as_df()[id_col].tolist()))))
        
        mod_gr = (mod_gr.apply(lambda df: (df.groupby(id_col)
                                       .filter(lambda x: filter_multi_exon(df, exon_number_col))
                                      )
                           ,
                           nb_cpu= nb_cpu
                          )
                 )
        print("After: {}".format(len(set(mod_gr.as_df()[id_col].tolist()))))
    
    
    
    # Pick last exon entry by max exon number for each transcript (id_col)
    if source is None:
        
        if which_exon == "last":
            out_gr = mod_gr.apply(lambda df: df.iloc[df.groupby(id_col)[exon_number_col].idxmax(),], nb_cpu = nb_cpu)
            
        elif which_exon == "first":
            out_gr = mod_gr.apply(lambda df: df.iloc[df.groupby(id_col)[exon_number_col].idxmin(),], nb_cpu = nb_cpu)
    
    elif source == "stringtie":
        # Doesn't respect strand - pick min if Minus strand, max if plus strand
        out_gr = (mod_gr.apply(lambda df: df.iloc[(df.groupby(id_col)
                                                  .apply(lambda df: stie_groupby_last_exon(df, exon_number_col, which_exon)
                                                        )
                                                 ),],
                               nb_cpu = nb_cpu
                              ) 
                 )
        

        
    else:
        pass
    
    return out_gr

    
def filter_transcripts_by_chain(novel_exons,ref_exons, match_type = "transcript", max_terminal_non_match=2, nb_cpu = 1):
    '''
    '''

    novel_cols_to_keep = ["Feature","transcript_id"]
    ref_cols_to_keep = ["Feature", "transcript_id", "gene_id", "gene_name"]

    assert match_type in ["transcript", "any"], "match_type must be one of 'transcript' or 'any'. value passed - {}".format(str(match_type))

    #1. Find introns by transcript & give each intron a unique ID
    print("finding introns...")
    t1 = timer()
    
    novel_introns = introns_by_tx(novel_exons, nb_cpu=nb_cpu).sort()
    ref_introns = introns_by_tx(ref_exons, nb_cpu=nb_cpu).sort()

    t2 = timer()
    
    print("took {} (s)".format(t2 - t1))
    
    print("adding intron_id column...")
    
    t3 = timer()
    novel_introns = intron_id(novel_introns)
    ref_introns = intron_id(ref_introns)
    t4 = timer()
    
    print("took {} s".format(t4 - t3))

    #2. Track number of introns in each novel transcript
    novel_tx_intron_counts = (novel_introns.as_df()
                              .groupby("transcript_id").size())


    # novel_introns, ref_introns

    # 3. Store intron_ids for each transcript, sorted by intron_number (where 1 = first intron regardless of strand) in a df/Series
    print("generating df of novel txipts sorted by intron number...")
    
    t5 = timer()
    
    novel_intron_ids_ordered = (novel_introns.as_df()
                                .groupby("transcript_id")
                                .apply(sort_introns_by_strand)
                                .reset_index(drop=True)
                                .rename({"index": "intron_number"}, axis="columns")
                               )
    novel_intron_ids_ordered["intron_number"] = novel_intron_ids_ordered["intron_number"].add(1)
    

    # df of txipt_id | intron_id | intron_number
    novel_intron_ids_ordered = novel_intron_ids_ordered.loc[:,["transcript_id","intron_id","intron_number"]]
    
    t6 = timer()
    print("took {} s".format(t6 - t5))
#     print(novel_intron_ids_ordered.dtypes)

    ref_introns = (ref_introns.as_df()
                                .groupby("transcript_id")
                                .apply(sort_introns_by_strand)
                                .reset_index(drop=True)
                                .rename({"index": "intron_number"}, axis="columns")
                               )
    ref_introns["intron_number"] = ref_introns["intron_number"].add(1)
    
    ref_introns = pr.PyRanges(ref_introns, int64=True)


    #4. Find novel introns with any overlap with reference introns
    # Inner join to add ref_rows to novel gr
    print("finding overlaps between novel and reference introns...")
    
    t7 = timer()
    joined = pr.PyRanges(novel_introns.as_df(), int64=True).join(ref_introns, strandedness="same", suffix ="_ref",nb_cpu=nb_cpu)
    t8 = timer()
    
    print("took {} s".format(t8 - t7))
    
    #5. Filter for overlaps that exactly match (or differ by given tolerance)
    print("filtering overlaps for exact matches...")
    
    t9 = timer()
    joined = joined.subset(lambda df: abs(df.Start - df.Start_ref) + abs(df.End - df.End_ref) <= 0, nb_cpu=nb_cpu)
    t10 = timer()
    
    print("took {} s".format(t10 - t9))
    
    # Minimal info needed on matches between novel and reference introns
    joined = joined.as_df()[["transcript_id","intron_id","transcript_id_ref","intron_id_ref"]]

#     print(joined.dtypes)

    #6. Join ordered novel introns with match info
    #7. Assign a simple tracker column 'match' of True (where intron is matched) and False (where intron is not matched)
    
    print("preparing for filtering intron matches...")
    t11 = timer()
    
    if match_type == "any":
        # Looking for intron to match any annotated intron, regardless of reference transcript
        novel_ref_match_info = novel_intron_ids_ordered.merge(joined,
                                                              how="left",
                                                              on="intron_id",
                                                              suffixes=["_novel","_match"]
                                                             )
        
        # Assign 'match' column for each intron.
        # Since we don't really care which intron it matches, & no matches will mean NaN
        novel_ref_match_info["match"] = novel_ref_match_info["transcript_id_ref"]
        novel_ref_match_info["match"] = novel_ref_match_info["match"].fillna(0)
        novel_ref_match_info["match"] = novel_ref_match_info["match"].replace("\w*", 1, regex=True)
        
        novel_ref_match_info = novel_ref_match_info.drop_duplicates(subset=["intron_id"])
        
        # Minimal informative info is novel tx, novel intron_id & number, match column
        novel_ref_match_info = novel_ref_match_info[["transcript_id_novel","intron_id","intron_number","match"]]
        

    elif match_type == "transcript":
        # Looking for introns (except last) to match the same reference transcript
        # merge_ordered can do a 'grouped merge' filling in empty rows (introns) for each transcript_id
        # This is especially useful if want to do transcript-specific intron matching
        # For each reference transcript, all novel introns will be filled with NaN if no overlap for given transcript_id
        # (i.e. novel txipt matches all but last intron of reference transcript)

        novel_ref_match_info = (pd.merge_ordered(novel_intron_ids_ordered,
                                    joined,
                                    how="left",
                                    on="intron_id",
                                    right_by="transcript_id_ref", # group matches by ref tx & join tx by tx
                                    suffixes=["_novel","_match"],
                                    fill_method=None)
                   .sort_values(by=["transcript_id_novel","intron_number"])
                               )
        
        # merge_ordered fills rows for each intron for each ref tx in df, regardless of whether any overlap
        # .dropna(axis="rows", subset=["intron_id_ref"]) 
        novel_ref_match_info = (novel_ref_match_info.groupby(["transcript_id_novel", "transcript_id_ref"])
                                .filter(lambda df: (df["intron_id_ref"].notna()).any()) # Retained if ref tx has >=1 matching introns
                                .reset_index(drop=True))
        
        # Make a match column where 1 = match, 0 = no match for each ref id and novel intron
        novel_ref_match_info["match"] = novel_ref_match_info["intron_id_ref"]
        novel_ref_match_info["match"] = novel_ref_match_info["match"].fillna(0)
        novel_ref_match_info["match"] = novel_ref_match_info["match"].replace("\w*", 1, regex=True)
        
    t12 = timer()
    print("took {} s".format(t12 - t11))
    
    
    # 8. Filter down matching transcripts to those that all ref introns except penultimate or all introns...
    print("filtering for valid intron chain matches...")
    t13 = timer()
    if match_type == "any":
        # Only need to check by novel transcript_id
        filt_novel_ref_match_info = (novel_ref_match_info.groupby("transcript_id_novel")
                                     .filter(lambda x: validate_matching_chain(x, max_terminal_non_match)
                                            )
                                    )
    
    elif match_type == "transcript":
        # Check novel tx vs each ref tx
        filt_novel_ref_match_info = (novel_ref_match_info.groupby(["transcript_id_novel","transcript_id_ref"])
                                     .filter(lambda x: validate_matching_chain(x, max_terminal_non_match)
                                            )
                                    )
    t14 = timer()
    print("took {} s".format(t14 - t13))
        
    
    
    # Return simplified df of novel transcript_id & matching transcript_ids if applicable
    
    if match_type == "any":
        chain_match_novel_ids = filt_novel_ref_match_info["transcript_id_novel"].drop_duplicates()
    
    elif match_type == "transcript":
        chain_match_novel_ids = filt_novel_ref_match_info[["transcript_id_novel","transcript_id_ref"]].drop_duplicates()
    
    # Extract valid transcripts occurring within first intron of annotated transcript (where intron chain matching is impossible)
    # For these to be valid, the 3'end of first exons should be identical (i.e. same outgoing junction)
    
    #2.1 - Pull out non-chain matched novel isoforms & their last exons
    
    if isinstance(chain_match_novel_ids, pd.Series):
        # match type was any
        novel_exons_nm = novel_exons.subset(lambda df: ~df["transcript_id"].isin(set(chain_match_novel_ids.tolist())), nb_cpu=nb_cpu)
        


    elif isinstance(chain_match_novel_ids, pd.DataFrame):
        # match_by/match_type was transcript
        novel_exons_nm = novel_exons.subset(lambda df: ~df["transcript_id"].isin(set(chain_match_novel_ids["transcript_id_novel"].tolist())), nb_cpu=nb_cpu)

    novel_nm_last_exons = get_terminal_exons(novel_exons_nm,
                                              exon_number_col="exon_number",
                                              source="stringtie",
                                              filter_single=True,
                                              which_exon="last",
                                              nb_cpu=nb_cpu)
    
    #2.2 - Extract first introns from ref transcripts
    ref_first_introns = get_terminal_exons(ref_introns,
                                           exon_number_col="intron_number",
                                           source=None,
                                           filter_single=False,
                                           which_exon="first",
                                           nb_cpu=nb_cpu
                                          )
    
    
    #2.3 - find last exons of non-matched txs completely contained within annotated first introns
    novel_nm_fi = pr.PyRanges(novel_nm_last_exons.as_df(), int64=True).overlap(ref_first_introns,
                                               how="containment",
                                               strandedness="same",
#                                                nb_cpu=nb_cpu
                                             )
    
    #2.5 - Get 3'ends of first exons of transcripts with first-intron contained last exons
    novel_nm_fi_fe_3p = (novel_exons_nm.subset(lambda df: 
                                            df["transcript_id"].isin(set(novel_nm_fi.transcript_id.tolist())),
                                            nb_cpu=nb_cpu
                                           )
                      .three_end()
                     )
    
    #2.6 - Get 3'ends of first exons of reference transcripts
    ref_first_exons_3p = (get_terminal_exons(ref_exons,
                                             exon_number_col="exon_number",
                                             source=None,
                                             filter_single=True,
                                             which_exon="first",
                                             nb_cpu=nb_cpu
                                            )
                          .three_end()
                         )
    
    # 2.7 - Find first-intron contained novel LE isoforms with the outgoing SJ of first exon matching a ref first exon
    first_intron_contained_match = novel_nm_fi_fe_3p.join(ref_first_exons_3p,
                                                          strandedness="same",
                                                          suffix="_ref",
                                                          nb_cpu=nb_cpu
                                                         )
    
    first_intron_contained_match = first_intron_contained_match.as_df()[["transcript_id","transcript_id_ref"]].rename({"transcript_id": "transcript_id_novel"}, axis=1)
    
    
    
    return pd.concat([pd.DataFrame(chain_match_novel_ids), first_intron_contained_match]).reset_index(drop=True)

In [None]:
filter_transcripts_by_chain(test_novel_gr, test_ref)

In [72]:
filter_transcripts_by_chain(test_novel_gr, test_ref,match_type="any")

finding introns...
took 0.0867136429878883 (s)
adding intron_id column...
took 0.017458201968111098 s
generating df of novel txipts sorted by intron number...
took 0.019409961998462677 s
finding overlaps between novel and reference introns...
took 0.02133792103268206 s
filtering overlaps for exact matches...
took 0.005796824989374727 s
preparing for filtering intron matches...
took 0.004225927987135947 s
filtering for valid intron chain matches...
took 0.0026123709976673126 s
Filtering for multi-exon transcripts...
Before: 4
After: 4
Filtering for multi-exon transcripts...
Before: 2
After: 2


Unnamed: 0,transcript_id_novel,transcript_id_ref
0,nov_tx_3ui_p,
1,nov_tx_bl_p,
2,nov_tx_exc_dist_p,
3,nov_tx_le_dist_p,
4,nov_tx_le_dist_p_minus,
5,nov_tx_mult_p,
6,nov_tx_si_p,
7,nov_tx_fi_p,ref_tr_1


In [24]:
pre_chr1 = timer()
chr1_any_matched = filter_transcripts_by_chain(stie_chr1_exons, ref_chr1_pc_exons, match_type="any", nb_cpu=2)
print("total time taken: {}".format(timer() - pre_chr1))
chr1_any_matched

finding introns...
took 79.48507030098699 (s)
adding intron_id column...
took 0.17554569896310568 s
generating df of novel txipts sorted by intron number...
took 4.423515614995267 s
finding overlaps between novel and reference introns...


2021-07-26 14:19:48,128	INFO services.py:1272 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


took 4.683659655973315 s
filtering overlaps for exact matches...


2021-07-26 14:19:52,562	INFO services.py:1272 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


took 3.6306387850199826 s
preparing for filtering intron matches...
took 0.19747438799822703 s
filtering for valid intron chain matches...
took 0.42044611898018047 s
total time taken: 93.09141320799245


26        PAPA.100.1
86       PAPA.1001.2
139      PAPA.1002.1
208      PAPA.1003.1
210      PAPA.1004.1
            ...     
99544     PAPA.985.1
99546     PAPA.985.6
99612     PAPA.994.2
99678     PAPA.995.2
99690     PAPA.999.1
Name: transcript_id_novel, Length: 1191, dtype: object

In [25]:
pre_chr1 = timer()
chr1_tx_matched = filter_transcripts_by_chain(stie_chr1_exons, ref_chr1_pc_exons, match_type="transcript", nb_cpu=2)
print("time taken: {}".format(timer() - pre_chr1))
chr1_tx_matched

finding introns...
took 84.49933640897507 (s)
adding intron_id column...
took 0.162128378986381 s
generating df of novel txipts sorted by intron number...
took 4.3569289009901695 s
finding overlaps between novel and reference introns...


2021-07-26 14:21:25,989	INFO services.py:1272 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


RayTaskError(KeyError): [36mray::_write_both()[39m (pid=1275, ip=10.97.45.105)
  File "python/ray/_raylet.pyx", line 501, in ray._raylet.execute_task
  File "/home/sam/miniconda3/envs/bioinfo/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 330, in _function_with_tracing
    return function(*args, **kwargs)
  File "/home/sam/miniconda3/envs/bioinfo/lib/python3.8/site-packages/pyranges/methods/join.py", line 111, in _write_both
    how = kwargs["how"]
KeyError: 'how'

2021-07-26 14:21:32,663	ERROR worker.py:78 -- Unhandled error (suppress with RAY_IGNORE_UNHANDLED_ERRORS=1): [36mray::_write_both()[39m (pid=1276, ip=10.97.45.105)
  File "python/ray/_raylet.pyx", line 501, in ray._raylet.execute_task
  File "/home/sam/miniconda3/envs/bioinfo/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 330, in _function_with_tracing
    return function(*args, **kwargs)
  File "/home/sam/miniconda3/envs/bioinfo/lib/python3.8/site-packages/pyranges/methods/join.py", line 111, in _write_both
    how = kwargs["how"]
KeyError: 'how'


In [80]:
stie_chr1.subset(lambda df: df["transcript_id"].isin(set(chr1_any_matched.tolist())))

NameError: name 'chr1_any_matched' is not defined

In [35]:
isinstance(chr1_any_matched,pd.Series)

True

In [79]:
stie_chr1.features.introns(by="transcript")

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,cov,FPKM,TPM,exon_number
0,chr1,StringTie,intron,2025054,2025333,1000,+,.,PAPA.100,PAPA.100.1,3.943609,0.467749,1.213933,
1,chr1,StringTie,intron,2025401,2025517,1000,+,.,PAPA.100,PAPA.100.1,3.943609,0.467749,1.213933,
2,chr1,StringTie,intron,2025738,2027576,1000,+,.,PAPA.100,PAPA.100.1,3.943609,0.467749,1.213933,
3,chr1,StringTie,intron,2027659,2028154,1000,+,.,PAPA.100,PAPA.100.1,3.943609,0.467749,1.213933,
4,chr1,StringTie,intron,2028292,2029110,1000,+,.,PAPA.100,PAPA.100.1,3.943609,0.467749,1.213933,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21118,chr1,StringTie,intron,71592968,71611025,1000,-,.,PAPA.995,PAPA.995.2,1.635092,0.193937,0.503319,
21119,chr1,StringTie,intron,71611146,71698007,1000,-,.,PAPA.995,PAPA.995.2,1.635092,0.193937,0.503319,
21120,chr1,StringTie,intron,71698139,71776171,1000,-,.,PAPA.995,PAPA.995.2,1.635092,0.193937,0.503319,
21121,chr1,StringTie,intron,71776297,71935078,1000,-,.,PAPA.995,PAPA.995.2,1.635092,0.193937,0.503319,


In [87]:
def add_intron_number(introns, id_col = "transcript_id", out_col="intron_number", nb_cpu=1):
    '''
    '''

    assert len(set(introns.as_df().Feature.tolist())) == 1, "only one feature type (e.g. all exons, all introns) should be present in gr"

    introns_n = (introns.assign(out_col,
                                lambda df: df.groupby(id_col)
                                             .apply(sort_introns_by_strand)
                                             .reset_index(drop=True)
                                             .rename({"index": out_col}, axis="columns")
                                             .assign(**{out_col: lambda x: x[out_col] + 1})[out_col],
                                             nb_cpu=nb_cpu
                                ))

    return introns_n

In [92]:
random_tx = pr.data.gencode_gtf().transcript_id.sample(n=5, random_state=123)
random_tx

1118    ENST00000327044.6
2308    ENST00000465727.5
2931    ENST00000354700.9
2553    ENST00000473215.5
2120    ENST00000379289.5
Name: transcript_id, dtype: object

In [93]:
testing_few_tx = pr.data.gencode_gtf().subset(lambda df: df.transcript_id.isin(random_tx.tolist()))
testing_few_tx


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,ENSEMBL,transcript,1173905,1197933,.,+,.,ENSG00000162571.13,protein_coding,...,protein_coding,TTLL10-202,2,CCDS,,,,,ENSP00000368591.1,CCDS44036.1
1,chr1,ENSEMBL,exon,1173905,1173926,.,+,.,ENSG00000162571.13,protein_coding,...,protein_coding,TTLL10-202,2,CCDS,,1,ENSE00001480388.1,,ENSP00000368591.1,CCDS44036.1
2,chr1,ENSEMBL,exon,1174284,1174321,.,+,.,ENSG00000162571.13,protein_coding,...,protein_coding,TTLL10-202,2,CCDS,,2,ENSE00001480387.1,,ENSP00000368591.1,CCDS44036.1
3,chr1,ENSEMBL,exon,1174423,1174489,.,+,.,ENSG00000162571.13,protein_coding,...,protein_coding,TTLL10-202,2,CCDS,,3,ENSE00001480465.1,,ENSP00000368591.1,CCDS44036.1
4,chr1,ENSEMBL,exon,1179188,1179333,.,+,.,ENSG00000162571.13,protein_coding,...,protein_coding,TTLL10-202,2,CCDS,,4,ENSE00002321912.1,,ENSP00000368591.1,CCDS44036.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,chr1,HAVANA,exon,1292375,1293708,.,-,.,ENSG00000131584.18,protein_coding,...,protein_coding,ACAP3-202,1,CCDS,OTTHUMT00000006366.2,24,ENSE00001889322.1,,ENSP00000346733.5,CCDS19.2
166,chr1,HAVANA,CDS,1293566,1293708,.,-,1,ENSG00000131584.18,protein_coding,...,protein_coding,ACAP3-202,1,CCDS,OTTHUMT00000006366.2,24,ENSE00001889322.1,,ENSP00000346733.5,CCDS19.2
167,chr1,HAVANA,stop_codon,1293563,1293566,.,-,0,ENSG00000131584.18,protein_coding,...,protein_coding,ACAP3-202,1,CCDS,OTTHUMT00000006366.2,24,ENSE00001889322.1,,ENSP00000346733.5,CCDS19.2
168,chr1,HAVANA,UTR,1307815,1308018,.,-,.,ENSG00000131584.18,protein_coding,...,protein_coding,ACAP3-202,1,CCDS,OTTHUMT00000006366.2,1,ENSE00001927436.1,,ENSP00000346733.5,CCDS19.2


In [94]:
testing_few_tx.subset(lambda df: df.Feature == "exon")[["transcript_id", "exon_number"]]

Unnamed: 0,Chromosome,Start,End,Strand,transcript_id,exon_number
0,chr1,1173905,1173926,+,ENST00000379289.5,1
1,chr1,1174284,1174321,+,ENST00000379289.5,2
2,chr1,1174423,1174489,+,ENST00000379289.5,3
3,chr1,1179188,1179333,+,ENST00000379289.5,4
4,chr1,1179656,1179737,+,ENST00000379289.5,5
...,...,...,...,...,...,...
68,chr1,1294717,1294816,-,ENST00000354700.9,20
69,chr1,1294401,1294628,-,ENST00000354700.9,21
70,chr1,1294089,1294199,-,ENST00000354700.9,22
71,chr1,1293822,1293933,-,ENST00000354700.9,23


In [103]:
print(len(pr.data.gencode_gtf().subset(lambda df: df.Feature == "exon").as_df().index))

(add_intron_number(pr.data.gencode_gtf().subset(lambda df: df.Feature == "exon"),
                   out_col="my_exon_number")#.subset(lambda df: df.my_exon_number == df.exon_number.astype(int))
 [["transcript_id", "my_exon_number", "exon_number"]])

2470


Unnamed: 0,Chromosome,Start,End,Strand,transcript_id,my_exon_number,exon_number
0,chr1,11868,12227,+,ENST00000456328.2,1,1
1,chr1,12612,12721,+,ENST00000456328.2,2,2
2,chr1,13220,14409,+,ENST00000456328.2,3,3
3,chr1,12009,12057,+,ENST00000450305.2,4,1
4,chr1,12178,12227,+,ENST00000450305.2,5,2
...,...,...,...,...,...,...,...
2465,chr1,1430549,1430954,-,ENST00000434150.1,4,2
2466,chr1,1434177,1434488,-,ENST00000454562.1,5,1
2467,chr1,1430549,1430662,-,ENST00000454562.1,6,2
2468,chr1,1434177,1434520,-,ENST00000417917.1,7,1


In [108]:
def _sort_introns_by_strand(df, out_col):
    '''
    '''
    
    n_exons = len(df.index)
    
    if (df["Strand"] == "+").all():
        # first in order by txipt = left-most start position (i.e. most 5')
        return df.assign(**{out_col: list(range(1, n_exons + 1))})
    # pd.Series(list(range(1, n_exons + 1)))
    
    elif (df["Strand"] == "-").all():
        # firs in order by strart os = most 3'
        return df.assign(**{out_col: list(range(1, n_exons +1))[::-1]})


    


def _add_intron_number(introns, id_col = "transcript_id", out_col="intron_number", nb_cpu=1):
    '''
    '''

    assert len(set(introns.as_df().Feature.tolist())) == 1, "only one feature type (e.g. all exons, all introns) should be present in gr"

    # Sort by position
    introns = introns.sort()
                       
    introns_out = introns.apply(lambda df: df.groupby(id_col).apply(lambda x: _sort_introns_by_strand(x, out_col)))
                       
#     introns_n = (introns.assign(out_col,
#                                 lambda df: df.groupby(id_col)
#                                              .apply(
#                                                  _sort_introns_by_strand)
#                                              .reset_index(drop=True)
#                                              .rename({"index": out_col}, axis="columns")
#                                              .assign(**{out_col: lambda x: x[out_col] + 1})[out_col],
#                                              nb_cpu=nb_cpu
#                                 ))

    return introns_out

In [110]:
(_add_intron_number(pr.data.gencode_gtf().subset(lambda df: df.Feature == "exon"),
                   out_col="my_exon_number").subset(lambda df: df.my_exon_number == df.exon_number.astype(int))
 [["transcript_id", "my_exon_number", "exon_number"]])

Unnamed: 0,Chromosome,Start,End,Strand,transcript_id,my_exon_number,exon_number
0,chr1,11868,12227,+,ENST00000456328.2,1,1
1,chr1,12009,12057,+,ENST00000450305.2,1,1
2,chr1,12178,12227,+,ENST00000450305.2,2,2
3,chr1,12612,12697,+,ENST00000450305.2,3,3
4,chr1,12612,12721,+,ENST00000456328.2,2,2
...,...,...,...,...,...,...,...
2465,chr1,1430663,1430954,-,ENST00000417917.1,2,2
2466,chr1,1431804,1431843,-,ENST00000434150.1,1,1
2467,chr1,1434177,1434488,-,ENST00000454562.1,1,1
2468,chr1,1434177,1434520,-,ENST00000417917.1,1,1


In [113]:
stmn2_ref = pr.read_gtf("../data/annotation/STMN2.gencode.v34.annotation.gtf")
stmn2_ref

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_id,transcript_type,transcript_name,protein_id,transcript_support_level,tag,ccdsid,havana_transcript,exon_number,exon_id
0,chr8,HAVANA,gene,79611116,79666158,.,+,.,ENSG00000104435.14,protein_coding,...,,,,,,,,,,
1,chr8,HAVANA,transcript,79611116,79666158,.,+,.,ENSG00000104435.14,protein_coding,...,ENST00000220876.12,protein_coding,STMN2-201,ENSP00000220876.7,1.0,CCDS,CCDS43748.1,OTTHUMT00000379261.1,,
2,chr8,HAVANA,exon,79611116,79611214,.,+,.,ENSG00000104435.14,protein_coding,...,ENST00000220876.12,protein_coding,STMN2-201,ENSP00000220876.7,1.0,CCDS,CCDS43748.1,OTTHUMT00000379261.1,1.0,ENSE00002093744.2
3,chr8,HAVANA,CDS,79611195,79611214,.,+,0,ENSG00000104435.14,protein_coding,...,ENST00000220876.12,protein_coding,STMN2-201,ENSP00000220876.7,1.0,CCDS,CCDS43748.1,OTTHUMT00000379261.1,1.0,ENSE00002093744.2
4,chr8,HAVANA,start_codon,79611195,79611198,.,+,0,ENSG00000104435.14,protein_coding,...,ENST00000220876.12,protein_coding,STMN2-201,ENSP00000220876.7,1.0,CCDS,CCDS43748.1,OTTHUMT00000379261.1,1.0,ENSE00002093744.2
5,chr8,HAVANA,exon,79636801,79636897,.,+,.,ENSG00000104435.14,protein_coding,...,ENST00000220876.12,protein_coding,STMN2-201,ENSP00000220876.7,1.0,CCDS,CCDS43748.1,OTTHUMT00000379261.1,2.0,ENSE00003660857.1
6,chr8,HAVANA,CDS,79636801,79636897,.,+,2,ENSG00000104435.14,protein_coding,...,ENST00000220876.12,protein_coding,STMN2-201,ENSP00000220876.7,1.0,CCDS,CCDS43748.1,OTTHUMT00000379261.1,2.0,ENSE00003660857.1
7,chr8,HAVANA,exon,79641377,79641550,.,+,.,ENSG00000104435.14,protein_coding,...,ENST00000220876.12,protein_coding,STMN2-201,ENSP00000220876.7,1.0,CCDS,CCDS43748.1,OTTHUMT00000379261.1,3.0,ENSE00000909925.1
8,chr8,HAVANA,CDS,79641377,79641550,.,+,2,ENSG00000104435.14,protein_coding,...,ENST00000220876.12,protein_coding,STMN2-201,ENSP00000220876.7,1.0,CCDS,CCDS43748.1,OTTHUMT00000379261.1,3.0,ENSE00000909925.1
9,chr8,HAVANA,exon,79654870,79655062,.,+,.,ENSG00000104435.14,protein_coding,...,ENST00000220876.12,protein_coding,STMN2-201,ENSP00000220876.7,1.0,CCDS,CCDS43748.1,OTTHUMT00000379261.1,4.0,ENSE00001010210.1


In [114]:
get_terminal_exons(stmn2_ref.subset(lambda df: df.Feature == "exon"),which_exon="first")

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_id,transcript_type,transcript_name,protein_id,transcript_support_level,tag,ccdsid,havana_transcript,exon_number,exon_id
0,chr8,HAVANA,exon,79611116,79611214,.,+,.,ENSG00000104435.14,protein_coding,...,ENST00000220876.12,protein_coding,STMN2-201,ENSP00000220876.7,1,CCDS,CCDS43748.1,OTTHUMT00000379261.1,1,ENSE00002093744.2
1,chr8,HAVANA,exon,79611151,79611214,.,+,.,ENSG00000104435.14,protein_coding,...,ENST00000518111.5,protein_coding,STMN2-202,ENSP00000429243.1,3,CCDS,CCDS56542.1,OTTHUMT00000379266.1,1,ENSE00001538470.2
2,chr8,HAVANA,exon,79611734,79611791,.,+,.,ENSG00000104435.14,protein_coding,...,ENST00000518491.1,protein_coding,STMN2-203,ENSP00000430102.1,2,basic,,OTTHUMT00000379267.1,1,ENSE00002097140.1


In [116]:
stie_chr8 = pr.read_gtf("../two_sample_example_output/stringtie/chr8.no_ref_id.TDP43-F_S6.assembled.gtf")

stie_stmn2 = stie_chr8.subset(lambda df: df.transcript_id == "PAPA.11426.2")

stie_stmn2

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,cov,FPKM,TPM,exon_number
0,chr8,StringTie,transcript,79611151,79617044,1000,+,.,PAPA.11426,PAPA.11426.2,3192.117432,378.615173,982.606506,
1,chr8,StringTie,exon,79611151,79611214,1000,+,.,PAPA.11426,PAPA.11426.2,547.977051,,,1.0
2,chr8,StringTie,exon,79616821,79617044,1000,+,.,PAPA.11426,PAPA.11426.2,3939.116699,,,2.0


In [119]:
def get_terminal_regions(gr,
                   feature_col = "Feature",
                   feature_key = "exon",
                   id_col = "transcript_id",
                   region_number_col = "exon_number",
                   source = None,
                   which_region="last",
                   filter_single = False,
                   nb_cpu = 1):
    '''
    Return gr of last exons for each transcript_id
    In process, region_number_col will be converted to type 'int'
    StringTie merged GTFs (or whatever tool single_steps/stringtie_longreads.smk is using)
    reports exon_number that DOES NOT RESPECT STRAND (from browsing in IGV)
    i.e. for minus-strand - largest exon_number for transcript corresponds to FIRST EXON, not last
    Annotated (i.e. Ensembl) reported exon_numbers DO RESPECT STRAND (i.e. max always = last exon)

    if Do respect strand, put source = None (default)
    if Don't respect strand, put source = "stringtie" (i.e. plus strand = max, minus strand = min)
    '''

    assert source in [None, "stringtie"]
    assert which_region in ["first", "last"]
    assert region_number_col in gr.columns.tolist()
    assert feature_col in gr.columns.tolist()
    assert id_col in gr.columns.tolist()

    # Make sure only 'exon' features are in the gr
    assert gr.as_df()[feature_col].drop_duplicates().tolist() == [feature_key], "only {} entries should be present in gr".format(feature_key)

    # Make sure region_number_col is int
    mod_gr = (gr.assign(region_number_col,
                      lambda df: df[region_number_col].astype(int),
                      nb_cpu = nb_cpu)
             )


    # Filter out single-exon transcripts
    if filter_single:
        print("Filtering for multi-exon transcripts...")
        print("Before: {}".format(len(set(mod_gr.as_df()[id_col].tolist()))))

        mod_gr = (mod_gr.apply(lambda df: (df.groupby(id_col)
                                       .filter(lambda x: filter_multi_exon(df, region_number_col))
                                      )
                           ,
                           nb_cpu=nb_cpu
                          )
                 )
        print("After: {}".format(len(set(mod_gr.as_df()[id_col].tolist()))))




    if source is None:
        # source = None means that 1 = first region of group regardless of strand
        # Pick last region entry by max region number for each transcript (id_col)
        # Pick first region entry by min region number for each transcript (id_col)

        if which_region == "last":
            out_gr = mod_gr.apply(lambda df: df.iloc[df.groupby(id_col)[region_number_col].idxmax(),], nb_cpu = nb_cpu)

        elif which_region == "first":
            out_gr = mod_gr.apply(lambda df: df.iloc[df.groupby(id_col)[region_number_col].idxmin(),], nb_cpu = nb_cpu)

    elif source == "stringtie":
        # Numbering Doesn't respect strand - pick min if Minus strand, max if plus strand
        out_gr = (mod_gr.apply(lambda df: df.iloc[(df.groupby(id_col)
                                                  .apply(lambda df: stie_groupby_last_exon(df, region_number_col, which_region)
                                                        )
                                                 ),],
                               nb_cpu = nb_cpu
                              )
                 )


    return out_gr

In [121]:
# Is STMN2 transcript completely contained within reference first intron (it should be...)?
ref_stmn2_introns = stmn2_ref.features.introns(by="transcript")
ref_stmn2_introns = _add_intron_number(ref_stmn2_introns)
print(ref_stmn2_introns[["gene_name", "transcript_id", "intron_number"]])

ref_stmn2_fi = get_terminal_regions(ref_stmn2_introns, feature_key="intron", region_number_col="intron_number", which_region="first")
ref_stmn2_fi

+--------------+-----------+-----------+--------------+-------------+--------------------+-----------------+
| Chromosome   | Start     | End       | Strand       | gene_name   | transcript_id      | intron_number   |
| (object)     | (int32)   | (int32)   | (category)   | (object)    | (object)           | (int64)         |
|--------------+-----------+-----------+--------------+-------------+--------------------+-----------------|
| chr8         | 79611214  | 79636801  | +            | STMN2       | ENST00000220876.12 | 1               |
| chr8         | 79611214  | 79636801  | +            | STMN2       | ENST00000518111.5  | 1               |
| chr8         | 79611791  | 79636801  | +            | STMN2       | ENST00000518491.1  | 1               |
| chr8         | 79636897  | 79641377  | +            | STMN2       | ENST00000220876.12 | 2               |
| ...          | ...       | ...       | ...          | ...         | ...                | ...             |
| chr8         | 79

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,protein_id,transcript_support_level,tag,ccdsid,havana_transcript,exon_number,exon_id,intron_number
0,chr8,HAVANA,intron,79611214,79636801,.,+,.,ENSG00000104435.14,protein_coding,...,protein_coding,STMN2-201,ENSP00000220876.7,1,CCDS,CCDS43748.1,OTTHUMT00000379261.1,,,1
1,chr8,HAVANA,intron,79611214,79636801,.,+,.,ENSG00000104435.14,protein_coding,...,protein_coding,STMN2-202,ENSP00000429243.1,3,CCDS,CCDS56542.1,OTTHUMT00000379266.1,,,1
2,chr8,HAVANA,intron,79611791,79636801,.,+,.,ENSG00000104435.14,protein_coding,...,protein_coding,STMN2-203,ENSP00000430102.1,2,basic,,OTTHUMT00000379267.1,,,1


In [126]:
stie_stmn2.subset(lambda df: df.exon_number == "2").overlap(ref_stmn2_fi, how="containment").drop_duplicate_positions()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,cov,FPKM,TPM,exon_number
0,chr8,StringTie,exon,79616821,79617044,1000,+,.,PAPA.11426,PAPA.11426.2,3939.116699,,,2


In [129]:
get_terminal_regions(stie_stmn2.subset(lambda df: df.Feature == "exon"),which_region="last",source="stringtie").overlap(ref_stmn2_fi, how="containment").drop_duplicate_positions()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,cov,FPKM,TPM,exon_number
0,chr8,StringTie,exon,79616821,79617044,1000,+,.,PAPA.11426,PAPA.11426.2,3939.116699,,,2


In [135]:
# Not lost to first intron filter...
# Is is lost to first exon 3'end matching (it shouldn't be... by eye ends are identical)
stie_stmn2_fe = get_terminal_regions(stie_stmn2.subset(lambda df: df.Feature == "exon"), which_region="first", source="stringtie")
ref_stmn2_fe = get_terminal_regions(stmn2_ref.subset(lambda df: df.Feature == "exon"), which_region="first")

stie_stmn2_fe.three_end().join(ref_stmn2_fe.three_end(), suffix="_ref")[["Start_ref", "End_ref", "transcript_id_ref"]]

Unnamed: 0,Chromosome,Start,End,Strand,Start_ref,End_ref,transcript_id_ref
0,chr8,79611214,79611215,+,79611214,79611215,ENST00000220876.12
1,chr8,79611214,79611215,+,79611214,79611215,ENST00000518111.5


In [139]:
stie_stmn2.subset(lambda df: df.Feature == "exon").three_end().join(ref_stmn2_fe.three_end())[["Start_b","End_b","transcript_id_b"]]

Unnamed: 0,Chromosome,Start,End,Strand,Start_b,End_b,transcript_id_b
0,chr8,79611214,79611215,+,79611214,79611215,ENST00000220876.12
1,chr8,79611214,79611215,+,79611214,79611215,ENST00000518111.5


In [144]:
df_testing = stie_stmn2_fe.three_end().as_df()
any(df_testing.Start == df_testing.End)

False