In [1]:
import pyranges as pr
import numpy as np
import pandas as pd
import os
import sys
import time
from timeit import default_timer as timer

In [2]:
# Reference transcript 1: four exons on the plus strand of chromosome 1
test_ref_tr1 = {
    "Chromosome": [1] * 4,
    "Start": [10, 100, 200, 300],
    "End": [30, 120, 220, 340],
    "Strand": ["+"] * 4,
    "Feature": ["exon"] * 4,
    "gene_id": ["ref_gene_1"] * 4,
    "transcript_id": ["ref_tr_1"] * 4,
}

# Show the transcript as a PyRanges table
pr.from_dict(test_ref_tr1)


Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,10,30,+,exon,ref_gene_1,ref_tr_1
1,1,100,120,+,exon,ref_gene_1,ref_tr_1
2,1,200,220,+,exon,ref_gene_1,ref_tr_1
3,1,300,340,+,exon,ref_gene_1,ref_tr_1


In [3]:
# Empty column template (same columns as the test transcripts)
{column: [] for column in ("Chromosome",
                           "Start",
                           "End",
                           "Strand",
                           "Feature",
                           "gene_id",
                           "transcript_id")}

{'Chromosome': [],
 'Start': [],
 'End': [],
 'Strand': [],
 'Feature': [],
 'gene_id': [],
 'transcript_id': []}

In [4]:
# Reference transcript 2: three exons on the minus strand of chromosome 2
test_ref_tr2 = {
    "Chromosome": [2] * 3,
    "Start": [10, 80, 100],
    "End": [20, 90, 120],
    "Strand": ["-"] * 3,
    "Feature": ["exon"] * 3,
    "gene_id": ["ref_gene_2"] * 3,
    "transcript_id": ["ref_tr_2"] * 3,
}

# Both reference transcripts combined into a single PyRanges
test_ref = pr.concat([pr.from_dict(tx) for tx in (test_ref_tr1, test_ref_tr2)])
test_ref

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,10,30,+,exon,ref_gene_1,ref_tr_1
1,1,100,120,+,exon,ref_gene_1,ref_tr_1
2,1,200,220,+,exon,ref_gene_1,ref_tr_1
3,1,300,340,+,exon,ref_gene_1,ref_tr_1
4,2,10,20,-,exon,ref_gene_2,ref_tr_2
5,2,80,90,-,exon,ref_gene_2,ref_tr_2
6,2,100,120,-,exon,ref_gene_2,ref_tr_2


In [5]:
# Novel test transcripts, one set per event type I want to cover

# 1. Novel last exon inside the first intron of an annotated transcript (e.g. STMN2)
# To pass, the first exon must share its 3'end with the first exon of a known transcript
# (nov_tx_fi_p ends its first exon at 30 like ref_tr_1; nov_tx_fi_f ends at 35)

test_novel_fi = {
    "Chromosome": [1] * 4,
    "Start": [10, 50] * 2,
    "End": [30, 70, 35, 70],
    "Strand": ["+"] * 4,
    "Feature": ["exon"] * 4,
    "gene_id": ["nov_gene_1"] * 4,
    "transcript_id": ["nov_tx_fi_p"] * 2 + ["nov_tx_fi_f"] * 2,
}

pr.from_dict(test_novel_fi)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,10,30,+,exon,nov_gene_1,nov_tx_fi_p
1,1,50,70,+,exon,nov_gene_1,nov_tx_fi_p
2,1,10,35,+,exon,nov_gene_1,nov_tx_fi_f
3,1,50,70,+,exon,nov_gene_1,nov_tx_fi_f


In [6]:
# 2. Spliced-in last exon fully contained within an internal intron (e.g. ONECUT1)
# To pass, the intron chain must match a known transcript up to the penultimate exon
# (nov_tx_si_p shares its first intron with ref_tr_1; nov_tx_si_f starts from a different exon)

test_novel_si = {
    "Chromosome": [1] * 6,
    "Start": [10, 100, 140] + [50, 100, 140],
    "End": [30, 120, 160] + [70, 120, 160],
    "Strand": ["+"] * 6,
    "Feature": ["exon"] * 6,
    "gene_id": ["nov_gene_1"] * 6,
    "transcript_id": ["nov_tx_si_p"] * 3 + ["nov_tx_si_f"] * 3,
}

pr.from_dict(test_novel_si)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,10,30,+,exon,nov_gene_1,nov_tx_si_p
1,1,100,120,+,exon,nov_gene_1,nov_tx_si_p
2,1,140,160,+,exon,nov_gene_1,nov_tx_si_p
3,1,50,70,+,exon,nov_gene_1,nov_tx_si_f
4,1,100,120,+,exon,nov_gene_1,nov_tx_si_f
5,1,140,160,+,exon,nov_gene_1,nov_tx_si_f


In [7]:
# 3. Bleedthrough into an internal intron (e.g. SIN3B)
# To pass, the intron chain must match a known transcript up to the penultimate exon
# (the last exon 200-240 runs past the annotated exon end at 220)

test_novel_bl = {
    "Chromosome": [1] * 6,
    "Start": [10, 100, 200] + [50, 100, 200],
    "End": [30, 120, 240] + [70, 120, 240],
    "Strand": ["+"] * 6,
    "Feature": ["exon"] * 6,
    "gene_id": ["nov_gene_1"] * 6,
    "transcript_id": ["nov_tx_bl_p"] * 3 + ["nov_tx_bl_f"] * 3,
}

pr.from_dict(test_novel_bl)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,10,30,+,exon,nov_gene_1,nov_tx_bl_p
1,1,100,120,+,exon,nov_gene_1,nov_tx_bl_p
2,1,200,240,+,exon,nov_gene_1,nov_tx_bl_p
3,1,50,70,+,exon,nov_gene_1,nov_tx_bl_f
4,1,100,120,+,exon,nov_gene_1,nov_tx_bl_f
5,1,200,240,+,exon,nov_gene_1,nov_tx_bl_f


In [8]:
# 4. Novel internal exon plus novel terminal exon inside an internal intron (e.g.)
# To pass, the intron chain must match a known transcript at its start,
# followed by a continuous chain of n novel events at the 3'end of the transcript
# (n can be varied)
# The event known from NP is fully contained within an annotated intron - also set this constraint?

test_novel_mult = {
    "Chromosome": [1] * 4,
    "Start": [10, 100, 130, 150],
    "End": [30, 120, 140, 160],
    "Strand": ["+"] * 4,
    "Feature": ["exon"] * 4,
    "gene_id": ["nov_gene_1"] * 4,
    "transcript_id": ["nov_tx_mult_p"] * 4,
}

pr.from_dict(test_novel_mult)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,10,30,+,exon,nov_gene_1,nov_tx_mult_p
1,1,100,120,+,exon,nov_gene_1,nov_tx_mult_p
2,1,130,140,+,exon,nov_gene_1,nov_tx_mult_p
3,1,150,160,+,exon,nov_gene_1,nov_tx_mult_p


In [9]:
# 5. 3'UTR intron fully contained within an annotated 3'UTR (e.g. TDP-43)
# To pass, the intron chain must match a known transcript up to the penultimate exon
# (to be annotated as a spliced-out 3'UTR intron after filtering for an intron chain match)
# Here the last two exons (300-310, 330-340) both sit inside the annotated last exon 300-340
test_novel_3ui = {
    "Chromosome": [1] * 5,
    "Start": [10, 100, 200, 300, 330],
    "End": [30, 120, 220, 310, 340],
    "Strand": ["+"] * 5,
    "Feature": ["exon"] * 5,
    "gene_id": ["nov_gene_1"] * 5,
    "transcript_id": ["nov_tx_3ui_p"] * 5,
}

pr.from_dict(test_novel_3ui)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,10,30,+,exon,nov_gene_1,nov_tx_3ui_p
1,1,100,120,+,exon,nov_gene_1,nov_tx_3ui_p
2,1,200,220,+,exon,nov_gene_1,nov_tx_3ui_p
3,1,300,310,+,exon,nov_gene_1,nov_tx_3ui_p
4,1,330,340,+,exon,nov_gene_1,nov_tx_3ui_p


In [10]:
# 6. Distal last exon spliced from the penultimate exon, i.e. a mutually exclusive last exon (e.g. SMC1A)
# To pass, the intron chain must match a known transcript up to the penultimate exon
# Needs more precise annotation later (i.e. to tell it apart from case 7)

test_novel_exc_dist = {
    "Chromosome": [1] * 4,
    "Start": [10, 100, 200, 360],
    "End": [30, 120, 220, 380],
    "Strand": ["+"] * 4,
    "Feature": ["exon"] * 4,
    "gene_id": ["nov_gene_1"] * 4,
    "transcript_id": ["nov_tx_exc_dist_p"] * 4,
}

pr.from_dict(test_novel_exc_dist)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,10,30,+,exon,nov_gene_1,nov_tx_exc_dist_p
1,1,100,120,+,exon,nov_gene_1,nov_tx_exc_dist_p
2,1,200,220,+,exon,nov_gene_1,nov_tx_exc_dist_p
3,1,360,380,+,exon,nov_gene_1,nov_tx_exc_dist_p


In [11]:
# 7. Distal last exon spliced from within the annotated last exon
#    (exon 300-310 starts where the annotated last exon 300-340 starts, then splices to 360-380)
# To pass, the intron chain must match a known transcript up to the penultimate exon
test_novel_le_dist = {
    "Chromosome": [1] * 5,
    "Start": [10, 100, 200, 300, 360],
    "End": [30, 120, 220, 310, 380],
    "Strand": ["+"] * 5,
    "Feature": ["exon"] * 5,
    "gene_id": ["nov_gene_1"] * 5,
    "transcript_id": ["nov_tx_le_dist_p"] * 5,
}

pr.from_dict(test_novel_le_dist)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,10,30,+,exon,nov_gene_1,nov_tx_le_dist_p
1,1,100,120,+,exon,nov_gene_1,nov_tx_le_dist_p
2,1,200,220,+,exon,nov_gene_1,nov_tx_le_dist_p
3,1,300,310,+,exon,nov_gene_1,nov_tx_le_dist_p
4,1,360,380,+,exon,nov_gene_1,nov_tx_le_dist_p


In [12]:
# 8. Distal last exon spliced from annotated (minus strand)
# To pass, the intron chain must match a known transcript up to the penultimate exon
# NOTE(review): the novel exon 40-50 lies between the ref_tr_2 exons 10-20 and 80-90 - confirm "distal" is intended
test_novel_le_dist_minus = {
    "Chromosome": [2] * 3,
    "Start": [40, 80, 100],
    "End": [50, 90, 120],
    "Strand": ["-"] * 3,
    "Feature": ["exon"] * 3,
    "gene_id": ["nov_gene_2"] * 3,
    "transcript_id": ["nov_tx_le_dist_p_minus"] * 3,
}

pr.from_dict(test_novel_le_dist_minus)

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,2,40,50,-,exon,nov_gene_2,nov_tx_le_dist_p_minus
1,2,80,90,-,exon,nov_gene_2,nov_tx_le_dist_p_minus
2,2,100,120,-,exon,nov_gene_2,nov_tx_le_dist_p_minus


In [13]:
# All novel test transcripts gathered into one PyRanges
test_novel_gr = pr.concat(
    list(map(pr.from_dict, [test_novel_3ui,
                            test_novel_bl,
                            test_novel_exc_dist,
                            test_novel_fi,
                            test_novel_le_dist,
                            test_novel_mult,
                            test_novel_si,
                            test_novel_le_dist_minus]))
)

test_novel_gr

Unnamed: 0,Chromosome,Start,End,Strand,Feature,gene_id,transcript_id
0,1,10,30,+,exon,nov_gene_1,nov_tx_3ui_p
1,1,100,120,+,exon,nov_gene_1,nov_tx_3ui_p
2,1,200,220,+,exon,nov_gene_1,nov_tx_3ui_p
3,1,300,310,+,exon,nov_gene_1,nov_tx_3ui_p
4,1,330,340,+,exon,nov_gene_1,nov_tx_3ui_p
5,1,10,30,+,exon,nov_gene_1,nov_tx_bl_p
6,1,100,120,+,exon,nov_gene_1,nov_tx_bl_p
7,1,200,240,+,exon,nov_gene_1,nov_tx_bl_p
8,1,50,70,+,exon,nov_gene_1,nov_tx_bl_f
9,1,100,120,+,exon,nov_gene_1,nov_tx_bl_f


In [14]:
def introns_from_df(df):
    '''
    Derive the introns of a single transcript from its exon rows.

    An intron runs from the End of one exon to the Start of the next exon in
    coordinate order. This holds for both strands, so the result does not depend on
    the strand value (it is only copied into the output).

    df: pandas DataFrame of the exons of ONE transcript with columns
        Chromosome, Start, End, Strand, gene_id and transcript_id.
        Rows may be in any order - they are position-sorted here.

    Returns a DataFrame with one row per intron (columns Chromosome, Start, End,
    Strand, Feature = "intron", gene_id, transcript_id; index "0", "1", ... in
    coordinate order), or None if the transcript has fewer than two exons.
    '''

    n_exons = len(df)

    # n exons = n-1 introns, so a single-exon transcript has no introns
    if n_exons < 2:
        return None

    # Sort by position so neighbouring rows are neighbouring exons,
    # rather than relying on the caller having sorted already
    exons = df.sort_values(["Start", "End"])

    # Chromosome / strand / ids are constant within a transcript - take them from the first exon
    first_exon = exons.iloc[0]

    return pd.DataFrame({"Chromosome": first_exon["Chromosome"],
                         # intron start = end of the upstream (leftmost) exon of each pair
                         "Start": exons["End"].to_numpy()[:-1],
                         # intron end = start of the next exon along
                         "End": exons["Start"].to_numpy()[1:],
                         "Strand": first_exon["Strand"],
                         "Feature": "intron",
                         "gene_id": first_exon["gene_id"],
                         "transcript_id": first_exon["transcript_id"]},
                        index=[str(i) for i in range(n_exons - 1)])
        


def introns_by_tx(gr, by="transcript_id", nb_cpu=1):
    '''
    Compute introns for every transcript in a PyRanges of exons.

    gr: PyRanges of exons
    by: column used to group rows into transcripts
    nb_cpu: passed through to PyRanges.apply

    Returns whatever gr.apply builds from the per-group introns_from_df results.
    '''
    def introns_per_group(df):
        # One introns_from_df call per transcript
        return df.groupby(by).apply(introns_from_df)

    # Position-sort first (for safety), then process each underlying dataframe
    return gr.sort().apply(introns_per_group, nb_cpu=nb_cpu)
    
    
# test_novel_introns = introns_by_tx(test_novel_gr)
# test_ref_introns = introns_by_tx(test_ref)
    
    
# test_novel_introns

In [15]:
# test_ref_introns

NameError: name 'test_ref_introns' is not defined

In [16]:
def rle(inarray):
    """
    Run length encoding of a sequence (in the spirit of R's rle function).

    Accepts any array-like, including non-numpy inputs and string values.
    returns: tuple (runlengths, startpositions, values);
             (None, None, None) for an empty input
    Adapted from Thomas Browne's answer at
    https://stackoverflow.com/questions/1066758/find-length-of-sequences-of-identical-values-in-a-numpy-array-run-length-encodi
    """
    values = np.asarray(inarray)                      # coerce to numpy
    size = len(values)
    if size == 0:
        return (None, None, None)

    changes = values[1:] != values[:-1]               # True where a value differs from its predecessor
    run_ends = np.append(np.where(changes), size - 1) # index of the last element of every run
    run_lengths = np.diff(np.append(-1, run_ends))
    run_starts = np.cumsum(np.append(0, run_lengths))[:-1]
    return (run_lengths, run_starts, values[run_ends])

In [None]:
# Prototype of the intron chain matching, run on the test transcripts.
# Dict of {novel_tx_id: {matches: [ref_id], chain_match: [n_matching], terminal_non_match: [n_not_matching]}}
novel_info_dict = {}

# Build the intron ranges here so the cell also runs on a fresh kernel
test_novel_introns = introns_by_tx(test_novel_gr)
test_ref_introns = introns_by_tx(test_ref)

# Only novel transcripts with more than one intron are compared
novel_multi_intron = test_novel_introns.apply(
    lambda df: df.groupby("transcript_id").filter(lambda grp: len(grp) > 1))

for key, dfs in pr.itergrs([novel_multi_intron, test_ref_introns], strand=True, keys=True):
    print("----processing chrom and strand pair {0} & {1}".format(key[0], key[1]))

    # dfs = novel & ref dataframes for this (chromosome, strand) key
    novel_df, ref_df = dfs
    strand = key[1]

    # Rows are position-sorted, so on the minus strand the first row is the LAST intron.
    # Stepping backwards flips the chain so that row 0 is always the first intron.
    step = -1 if strand == "-" else 1

    # Reference chains are reused for every novel transcript, so build them once per key
    ref_chains = [(ref_id, ref_introns[["Start", "End"]].to_numpy()[::step])
                  for ref_id, ref_introns in ref_df.groupby("transcript_id")]

    # For each novel transcript, test it against all ref transcripts
    for novel_id, novel_introns in novel_df.groupby("transcript_id"):
        novel_chain = novel_introns[["Start", "End"]].to_numpy()[::step]

        # First pass: keep only ref transcripts with an identical first intron
        candidates = [(ref_id, ref_chain) for ref_id, ref_chain in ref_chains
                      if np.array_equal(novel_chain[:1], ref_chain[:1])]

        if not candidates:
            print("{0} does not match any reference transcripts in its first intron. Skipping".format(novel_id))
            continue

        for ref_id, ref_chain in candidates:
            # Compare only as far as the shorter of the two chains
            max_chain = min(len(novel_chain), len(ref_chain))

            # One True/False per intron - do both Start and End match?
            intron_matches = (novel_chain[:max_chain] == ref_chain[:max_chain]).all(axis=1)

            runs, _, vals = rle(intron_matches)

            # Valid = every compared intron matches, or a run of matches followed by one run of non-matches
            if not (np.all(vals) or np.array_equal(vals, [True, False])):
                continue

            # vals of size 1 = all introns match (e.g. bleedthrough), otherwise 2nd run = non-matches
            terminal_non_match = 0 if vals.size == 1 else runs[1]

            info = novel_info_dict.setdefault(novel_id, {"matches": [],
                                                         "chain_match": [],
                                                         "terminal_non_match": []})
            info["matches"].append(ref_id)
            # Chain always starts with a match, so the first run is the number of matching introns
            # (previously the ref id was appended to these two lists by mistake)
            info["chain_match"].append(runs[0])
            info["terminal_non_match"].append(terminal_non_match)

print(novel_info_dict)

pd.concat({key: pd.DataFrame.from_dict(d, orient = "columns") for key, d in novel_info_dict.items()}, axis=0)

In [None]:
def match_intron_chains(novel_gr, ref_gr, id_col = "transcript_id", nb_cpu = 1):
    '''
    Compare the intron chain of every novel transcript against reference transcripts.

    Both grs should contain introns (one row per intron). Transcripts are only compared
    within the same chromosome & strand pair. Chains are compared intron by intron
    starting from the first intron of the transcript (minus strand chains are reversed
    so row 0 is the first intron), and only as far as the shorter of the two chains.

    A reference transcript is recorded as a match if the compared introns are all
    identical, or identical at the start of the chain followed by a single run of
    non-matching introns at the end.

    novel_gr: PyRanges of introns of novel transcripts
    ref_gr: PyRanges of introns of reference transcripts
    id_col: column identifying transcripts in both objects
    nb_cpu: currently unused (kept so existing calls keep working)

    Returns dict of
    {novel_id: {"matches": [ref_id], "chain_match": [n], "terminal_non_match": [n]}}
    where the three lists are parallel (one entry per matching reference transcript).
    Novel transcripts without any valid match are absent from the dict.

    NOTE(review): introns of the longer chain beyond the compared length are not
    counted in terminal_non_match - confirm this is intended.
    '''

    def _ordered_chain(introns, strand):
        # (Start, End) array ordered from first to last intron of the transcript.
        # Position-sort first, then flip for the minus strand where the leftmost intron is the last one.
        coords = introns.sort_values(["Start", "End"])[["Start", "End"]].to_numpy()
        return coords[::-1] if strand == "-" else coords

    # {novel_id: {matches: [ref_id], chain_match: [n], terminal_non_match: [n]}}
    match_info_dict = {}

    for key, dfs in pr.itergrs([novel_gr, ref_gr], strand=True, keys=True):
        print("----processing chrom and strand pair {0} & {1}".format(key[0], key[1]))

        # dfs = novel_gr & ref_gr dataframes matched by key (chromosome & strand)
        novel_df, ref_df = dfs

        # Pyranges keys are tuples of (chr,strand)
        strand = key[1]

        # Every novel transcript is compared against the same reference chains,
        # so order them once per key instead of once per novel transcript
        ref_chains = [(ref_id, _ordered_chain(ref_introns, strand))
                      for ref_id, ref_introns in ref_df.groupby(id_col)]

        # Comparing each novel transcript against all ref transcripts
        for novel_id, novel_introns in novel_df.groupby(id_col):

            novel_chain = _ordered_chain(novel_introns, strand)

            # As a first pass, keep only reference Txs with an identical first intron
            candidates = [(ref_id, ref_chain) for ref_id, ref_chain in ref_chains
                          if np.array_equal(novel_chain[:1], ref_chain[:1])]

            if not candidates:
                print("{0} does not match any reference transcripts in its first intron. Skipping".format(novel_id))
                continue

            # Compare full intron chains against each ref transcript with a first intron match
            for ref_id, ref_chain in candidates:

                # To avoid a slicing error for ref txipts shorter than novel
                max_chain = min(len(novel_chain), len(ref_chain))

                # One True/False per intron - do both Start and End match the reference intron?
                intron_matches = (novel_chain[:max_chain] == ref_chain[:max_chain]).all(axis=1)

                runs, _, vals = rle(intron_matches)

                # Considered a valid match if the intron chain is completely identical
                # or matches at the beginning of the txipt but differs at the end
                if not (np.all(vals) or np.array_equal(vals, [True, False])):
                    continue

                if vals.size == 1:
                    # i.e. all true/introns match (e.g. bleedthrough)
                    terminal_non_match = 0
                else:
                    # All valid = consecutive match & non-match, so non-match = 2nd in array
                    terminal_non_match = runs[1]

                # Don't throw away any valid match (yet) - record every one of them
                info = match_info_dict.setdefault(novel_id, {"matches": [],
                                                             "chain_match": [],
                                                             "terminal_non_match": []})
                info["matches"].append(ref_id)
                info["chain_match"].append(runs[0])  # Always starts with true, so take length of true
                info["terminal_non_match"].append(terminal_non_match)

    return match_info_dict
        


In [None]:
# Build the intron ranges inline: test_novel_introns / test_ref_introns are not guaranteed
# to exist on a fresh kernel, and passing them by name then raises a NameError
match_intron_chains(introns_by_tx(test_novel_gr), introns_by_tx(test_ref))

In [None]:
# Sanity checks for the helpers used in the chain matching. Written as assertions so
# neither result is silently dropped (only the last expression of a cell is displayed).
# array_equal accepts a plain list as its second argument
assert np.array_equal(np.array([True, False]), [True, False])
# np.all is True for an all-True sequence
assert np.all([True,True])

In [None]:
# test_introns = pr.from_dict(test_novel_3ui)#.subset(lambda df: df["transcript_id"] == "nov_tx_3ui_p")

# n_exons = len(test_introns)
# # n exons = n -1 introns

# # plus strand txipt - End of first exon = start of intron, Start of 3'exon = end coord

# test_introns.as_df().iloc[0,lambda x: x.columns.get_loc("End")]

# def introns_from_df(df):
#     '''
#     '''
    
#     n_exons = len(df)
    
#     if n_exons < 2:
#         raise Exception("at least two exons are required for transcript to have an intron")
#     # n exons = n-1 introns
    
#     strand = df["Strand"].drop_duplicates().tolist()[0]
# #     print(strand)
#     chrom = df["Chromosome"].drop_duplicates().tolist()[0]
#     gene_id = df["gene_id"].drop_duplicates().tolist()[0]
#     tx_id = df["transcript_id"].drop_duplicates().tolist()[0]
#     feature = "intron"
#     introns = {}
#     for i in range(0, n_exons - 1):
#         if strand == "+":
#             intron_start = df.iloc[i, lambda x: x.columns.get_loc("End")]
#             intron_end = df.iloc[i+1, lambda x: x.columns.get_loc("Start")]
#             introns[str(i)] = {"Chromosome": chrom,
#                                "Start": intron_start,
#                                "End": intron_end,
#                                "Strand": strand,
#                                "Feature": feature,
#                                "gene_id": gene_id,
#                                "transcript_id": tx_id}
#         elif strand == "-":
#             intron_end = df.iloc[i, lambda x: x.columns.get_loc("Start")]
#             intron_start = df.iloc[i+1, lambda x: x.columns.get_loc("End")]
#             introns[str(i)] = {"Chromosome": chrom,
#                                "Start": intron_start,
#                                "End": intron_end,
#                                "Strand": strand,
#                                "Feature": feature,
#                                "gene_id": gene_id,
#                                "transcript_id": tx_id}
#     return pd.DataFrame.from_dict(introns, orient = "index")
        


# def introns_by_tx(gr, by="transcript_id", nb_cpu=1):
#     '''
#     '''
    
#     return gr.apply(lambda df: df.groupby(by).apply(introns_from_df), nb_cpu=1)
    
    
# test_introns

In [None]:
# test_introns is only assigned in commented-out code, so build the input inline
# (same input as that commented-out assignment: the 3'UTR intron test transcript)
introns_by_tx(pr.from_dict(test_novel_3ui))

In [None]:
# Introns per transcript for the exons of ENSG00000205231 in the pyranges example Ensembl GTF
introns_by_tx(
    pr.data.ensembl_gtf()
    .subset(lambda df: (df.gene_id == "ENSG00000205231") & (df.Feature == "exon"))
)

In [None]:

# Column dtypes returned by pyranges' own introns method for a single transcript
(
    pr.data.ensembl_gtf()
    .subset(lambda df: df.transcript_id == "ENST00000450305")
    [["transcript_id","Feature"]]
    .features.introns(by="transcript")
    .dtypes
)

In [None]:
# Alternative input tried before: test_ref.features.introns(by="transcript")
# (selecting only [["gene_id","transcript_id"]] before .features was also tried)

# pyranges' built-in introns for transcript ENST00000450305, keeping all columns
(
    pr.data.ensembl_gtf()
    .subset(lambda df: df.transcript_id == "ENST00000450305")
    .features.introns(by="transcript")
)


In [None]:
# Exon rows of ENSG00000205231 from the example Ensembl GTF
(
    pr.data.ensembl_gtf()
    .subset(lambda df: (df.gene_id == "ENSG00000205231") & (df.Feature == "exon"))
)

In [None]:
# pyranges' built-in introns for every transcript of ENSG00000205231
# Intron = start of first feature, end of the next one along (if properly sorted)
(
    pr.data.ensembl_gtf()
    .subset(lambda df: (df.gene_id == "ENSG00000205231"))
    .features.introns(by="transcript")
)

In [17]:
# Input GTF files (relative paths, so they resolve against the kernel's working directory)
path_stie_gtf_chr1_nov = (
    "../two_sample_example_output/stringtie/chr1.no_ref_id.TDP43-F_S6.assembled.gtf"
)
path_ref_gtf = (
    "../data/annotation/gencode.v34.annotation.gtf"
)

In [18]:
# Read the StringTie GTF at path_stie_gtf_chr1_nov into a PyRanges and display it
stie_chr1 = pr.read_gtf(path_stie_gtf_chr1_nov)
stie_chr1

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,cov,FPKM,TPM,exon_number
0,chr1,StringTie,transcript,827750,859332,1000,+,.,PAPA.35,PAPA.35.14,98.466469,11.679049,30.310225,
1,chr1,StringTie,exon,827750,827775,1000,+,.,PAPA.35,PAPA.35.14,96.734062,,,1
2,chr1,StringTie,exon,829002,829104,1000,+,.,PAPA.35,PAPA.35.14,110.622597,,,2
3,chr1,StringTie,exon,847653,847806,1000,+,.,PAPA.35,PAPA.35.14,118.018204,,,3
4,chr1,StringTie,exon,851926,852110,1000,+,.,PAPA.35,PAPA.35.14,115.267273,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26008,chr1,StringTie,exon,248855557,248855635,1000,-,.,PAPA.2531,PAPA.2531.4,5.161189,,,5
26009,chr1,StringTie,exon,248855724,248855943,1000,-,.,PAPA.2531,PAPA.2531.4,4.278336,,,6
26010,chr1,StringTie,exon,248856287,248856422,1000,-,.,PAPA.2531,PAPA.2531.4,5.487612,,,7
26011,chr1,StringTie,exon,248856513,248856562,1000,-,.,PAPA.2531,PAPA.2531.4,5.115693,,,8


In [19]:
# Read the reference annotation GTF at path_ref_gtf into a PyRanges and display it
ref = pr.read_gtf(path_ref_gtf)
ref

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,gene,11868,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,,,,,,,,,,
1,chr1,HAVANA,transcript,11868,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,,,,,
2,chr1,HAVANA,exon,11868,12227,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,1,ENSE00002234944.1,,,
3,chr1,HAVANA,exon,12612,12721,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,2,ENSE00003582793.1,,,
4,chr1,HAVANA,exon,13220,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,3,ENSE00002312635.1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912491,chrY,HAVANA,exon,57214349,57214397,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,1,ENSE00002072208.1,PGO:0000005,,
2912492,chrY,HAVANA,exon,57213879,57213964,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,2,ENSE00002046926.1,PGO:0000005,,
2912493,chrY,HAVANA,exon,57213525,57213602,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,3,ENSE00002021169.1,PGO:0000005,,
2912494,chrY,HAVANA,exon,57213203,57213357,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,4,ENSE00002036959.1,PGO:0000005,,


In [20]:
# Keep only the exon rows of the StringTie assembly (drops the transcript rows)
stie_chr1_exons = stie_chr1.subset(
    lambda df: df["Feature"].eq("exon")
)
stie_chr1_exons

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,cov,FPKM,TPM,exon_number
0,chr1,StringTie,exon,827750,827775,1000,+,.,PAPA.35,PAPA.35.14,96.734062,,,1
1,chr1,StringTie,exon,829002,829104,1000,+,.,PAPA.35,PAPA.35.14,110.622597,,,2
2,chr1,StringTie,exon,847653,847806,1000,+,.,PAPA.35,PAPA.35.14,118.018204,,,3
3,chr1,StringTie,exon,851926,852110,1000,+,.,PAPA.35,PAPA.35.14,115.267273,,,4
4,chr1,StringTie,exon,852670,852766,1000,+,.,PAPA.35,PAPA.35.14,135.545364,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23563,chr1,StringTie,exon,248855557,248855635,1000,-,.,PAPA.2531,PAPA.2531.4,5.161189,,,5
23564,chr1,StringTie,exon,248855724,248855943,1000,-,.,PAPA.2531,PAPA.2531.4,4.278336,,,6
23565,chr1,StringTie,exon,248856287,248856422,1000,-,.,PAPA.2531,PAPA.2531.4,5.487612,,,7
23566,chr1,StringTie,exon,248856513,248856562,1000,-,.,PAPA.2531,PAPA.2531.4,5.115693,,,8


In [21]:
# Exon rows of the reference annotation, genome-wide ...
ref_exons = ref.subset(
    lambda df: df["Feature"].eq("exon")
)
# ... and restricted to chr1
ref_chr1_exons = ref_exons.subset(
    lambda df: df["Chromosome"].eq("chr1")
)
ref_exons

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,exon,11868,12227,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,1,ENSE00002234944.1,,,
1,chr1,HAVANA,exon,12612,12721,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,2,ENSE00003582793.1,,,
2,chr1,HAVANA,exon,13220,14409,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,processed_transcript,DDX11L1-202,1,basic,OTTHUMT00000362751.1,3,ENSE00002312635.1,,,
3,chr1,HAVANA,exon,12009,12057,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,transcribed_unprocessed_pseudogene,DDX11L1-201,,basic,OTTHUMT00000002844.1,1,ENSE00001948541.1,PGO:0000019,,
4,chr1,HAVANA,exon,12178,12227,.,+,.,ENSG00000223972.5,transcribed_unprocessed_pseudogene,...,transcribed_unprocessed_pseudogene,DDX11L1-201,,basic,OTTHUMT00000002844.1,2,ENSE00001671638.2,PGO:0000019,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1378883,chrY,HAVANA,exon,57214349,57214397,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,1,ENSE00002072208.1,PGO:0000005,,
1378884,chrY,HAVANA,exon,57213879,57213964,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,2,ENSE00002046926.1,PGO:0000005,,
1378885,chrY,HAVANA,exon,57213525,57213602,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,3,ENSE00002021169.1,PGO:0000005,,
1378886,chrY,HAVANA,exon,57213203,57213357,.,-,.,ENSG00000227159.8_PAR_Y,unprocessed_pseudogene,...,unprocessed_pseudogene,DDX11L16-201,,PAR,OTTHUMT00000058841.1,4,ENSE00002036959.1,PGO:0000005,,


In [22]:
# Keep only reference exons whose gene_type is protein_coding or lncRNA
# print(ref_chr1_exons.gene_type.value_counts())
ref_chr1_pc_exons = ref_chr1_exons.subset(
    lambda exon_df: exon_df["gene_type"].isin(["protein_coding", "lncRNA"])
)
ref_chr1_pc_exons

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,exon,29553,30039,.,+,.,ENSG00000243485.5,lncRNA,...,lncRNA,MIR1302-2HG-202,5,basic,OTTHUMT00000002840.1,1,ENSE00001947070.1,,,
1,chr1,HAVANA,exon,30563,30667,.,+,.,ENSG00000243485.5,lncRNA,...,lncRNA,MIR1302-2HG-202,5,basic,OTTHUMT00000002840.1,2,ENSE00001922571.1,,,
2,chr1,HAVANA,exon,30975,31097,.,+,.,ENSG00000243485.5,lncRNA,...,lncRNA,MIR1302-2HG-202,5,basic,OTTHUMT00000002840.1,3,ENSE00001827679.1,,,
3,chr1,HAVANA,exon,30266,30667,.,+,.,ENSG00000243485.5,lncRNA,...,lncRNA,MIR1302-2HG-201,5,basic,OTTHUMT00000002841.1,1,ENSE00001841699.1,,,
4,chr1,HAVANA,exon,30975,31109,.,+,.,ENSG00000243485.5,lncRNA,...,lncRNA,MIR1302-2HG-201,5,basic,OTTHUMT00000002841.1,2,ENSE00001890064.1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121907,chr1,HAVANA,exon,248857668,248858017,.,-,.,ENSG00000171163.16,protein_coding,...,retained_intron,ZNF692-218,3,,OTTHUMT00000097304.1,3,ENSE00001904444.1,,,
121908,chr1,HAVANA,exon,248859014,248859144,.,-,.,ENSG00000171163.16,protein_coding,...,retained_intron,ZNF692-229,4,,OTTHUMT00000382605.1,1,ENSE00002142255.1,,,
121909,chr1,HAVANA,exon,248857864,248858321,.,-,.,ENSG00000171163.16,protein_coding,...,retained_intron,ZNF692-229,4,,OTTHUMT00000382605.1,2,ENSE00002187693.1,,,
121910,chr1,HAVANA,exon,248858917,248859033,.,-,.,ENSG00000171163.16,protein_coding,...,retained_intron,ZNF692-228,4,,OTTHUMT00000382606.1,1,ENSE00002188413.1,,,


In [None]:
# stie_chr1_introns = introns_by_tx(stie_chr1_exons,nb_cpu=1)
# stie_chr1_introns

In [None]:
# ref_all_introns = introns_by_tx(ref_exons, nb_cpu=4)
# ref_all_introns
# ref_chr1_introns = introns_by_tx(ref_chr1_pc_exons, nb_cpu=2)
# ref_chr1_introns

In [None]:
# chr1_chain_matching = match_intron_chains(stie_chr1_introns, ref_chr1_introns)
# chr1_chain_matching

In [None]:
# ref_chr1_introns.subset(lambda df: df.transcript_id == "ENST00000674495.1")

In [None]:
# ref_chr1_exons.subset(lambda df: df.transcript_id == "ENST00000674495.1").sort()

### Alternative (hopefully more scalable) approach

Try to use pyranges internals as much as possible, and avoid manual looping and comparisons.

Essentially: find overlapping introns, then filter for those that are identical.

In [23]:
def intron_id(gr):
    '''
    Add an 'intron_id' column of the form '<transcript_id>:<Start>:<End>'.

    gr: object exposing .assign(column_name, function); the function is called
        with a DataFrame that has 'transcript_id', 'Start' and 'End' columns
        (a PyRanges of introns in this notebook).

    Returns whatever gr.assign returns (the input with the new column added).
    '''

    def _build_ids(df):
        # Concatenate the frame's own columns so the resulting Series keeps
        # df's index. A Series built from a plain list would get a fresh
        # 0..n-1 index and could misalign with a sorted/filtered chunk if the
        # caller assigns it by label.
        return (df["transcript_id"].astype(str)
                + ":" + df["Start"].astype(str)
                + ":" + df["End"].astype(str))

    return gr.assign("intron_id", _build_ids)

def sort_introns_by_strand(df):
    '''
    Order one group's rows along the direction of its strand and number them.

    '+' groups are sorted by ascending End, '-' groups by descending Start.
    The original row labels are discarded and the new 0-based position is
    exposed as a leading 'index' column.

    Returns None when the group is neither all '+' nor all '-'.
    '''
    strand = df.Strand

    if (strand == '+').all():
        ordered = df.sort_values(by=['End'])
    elif (strand == '-').all():
        ordered = df.sort_values(by=['Start'], ascending=False)
    else:
        # Mixed or unstranded group: nothing is returned
        return None

    # 1st reset_index drops the labels the rows had in the parent frame,
    # 2nd reset_index turns the fresh 0..n-1 order into the 'index' column
    renumbered = ordered.reset_index(drop=True)
    return renumbered.reset_index()


    
def validate_matching_chain(df, max_terminal_non_match=2):
    '''
    Decide whether one group's ordered 'match' column is an acceptable chain.

    Intended to be applied to a grouped df. The 'match' column is run-length
    encoded with rle(); the chain is accepted when

    - every run value is truthy (all introns match, e.g. bleedthrough event), or
    - the run values are exactly [1, 0] and the final (non-matching) run is
      at most max_terminal_non_match long.

    Returns True for an accepted chain, False otherwise.
    '''
    run_lengths, _run_starts, run_values = rle(df["match"])

    # Every intron matched
    if np.all(run_values):
        return True

    # One matching stretch followed by a single short unmatched tail
    if np.array_equal(run_values, [1, 0]) and run_lengths[-1] <= max_terminal_non_match:
        return True

    return False

    
def filter_transcripts_by_chain(novel_exons, ref_exons, match_type="transcript", max_terminal_non_match=2, nb_cpu=1):
    '''
    Report novel transcripts whose intron chain is consistent with reference introns.

    Introns are derived per transcript for both inputs, novel introns are
    paired with same-strand reference introns that have identical Start and
    End, and each novel transcript's ordered chain of matched / unmatched
    introns is then checked with validate_matching_chain.

    Parameters
    ----------
    novel_exons, ref_exons
        Exon ranges handed to introns_by_tx (presumably PyRanges objects with
        a transcript_id column - confirm against introns_by_tx).
    match_type : "transcript" or "any"
        "any": a novel intron may match an intron of any reference transcript.
        "transcript": the chain is evaluated separately against every
        reference transcript that shares at least one intron with the novel
        transcript.
    max_terminal_non_match : int
        Forwarded to validate_matching_chain (longest accepted run of
        unmatched introns at the end of the chain).
    nb_cpu : int
        Forwarded to introns_by_tx and to the pyranges join / subset calls.

    Returns
    -------
    match_type == "any": Series named 'transcript_id_novel' with the unique
    ids of passing novel transcripts.
    match_type == "transcript": DataFrame with the unique
    ('transcript_id_novel', 'transcript_id_ref') pairs that pass.

    Raises
    ------
    AssertionError
        If match_type is not "transcript" or "any".
    '''

    assert match_type in ["transcript", "any"], "match_type must be one of 'transcript' or 'any'. value passed - {}".format(str(match_type))

    # 1. Find introns by transcript & give each intron a unique ID
    print("finding introns...")
    step_start = timer()

    novel_introns = introns_by_tx(novel_exons, nb_cpu=nb_cpu).sort()
    ref_introns = introns_by_tx(ref_exons, nb_cpu=nb_cpu).sort()

    print("took {} (s)".format(timer() - step_start))

    print("adding intron_id column...")
    step_start = timer()

    novel_introns = intron_id(novel_introns)
    ref_introns = intron_id(ref_introns)

    print("took {} s".format(timer() - step_start))

    # 2. Store intron_ids for each novel transcript, numbered along the
    #    transcript (1 = first intron regardless of strand)
    print("generating df of novel txipts sorted by intron number...")
    step_start = timer()

    novel_intron_ids_ordered = (novel_introns.as_df()
                                .groupby("transcript_id")
                                .apply(sort_introns_by_strand)
                                .reset_index(drop=True)
                                .rename({"index": "intron_number"}, axis="columns")
                               )
    # sort_introns_by_strand numbers from 0; shift to 1-based
    novel_intron_ids_ordered["intron_number"] = novel_intron_ids_ordered["intron_number"].add(1)

    # df of txipt_id | intron_id | intron_number
    novel_intron_ids_ordered = novel_intron_ids_ordered.loc[:, ["transcript_id", "intron_id", "intron_number"]]

    print("took {} s".format(timer() - step_start))

    # 3. Find novel introns with any overlap with reference introns
    #    (inner join adds the reference columns with a '_ref' suffix)
    print("finding overlaps between novel and reference introns...")
    step_start = timer()

    joined = novel_introns.join(ref_introns, strandedness="same", suffix="_ref", nb_cpu=nb_cpu)

    print("took {} s".format(timer() - step_start))

    # 4. Keep only overlaps whose coordinates are identical
    print("filtering overlaps for exact matches...")
    step_start = timer()

    joined = joined.subset(lambda df: (df.Start == df.Start_ref) & (df.End == df.End_ref),
                           nb_cpu=nb_cpu)

    print("took {} s".format(timer() - step_start))

    if len(joined) == 0:
        # No novel intron is identical to a reference intron, so no chain can
        # pass. Return an empty result of the usual shape here.
        # NOTE(review): pyranges' as_df() is expected to give a column-less
        # frame for an empty result, which would make the column selection
        # below raise KeyError - confirm against the installed version.
        if match_type == "any":
            return pd.Series([], name="transcript_id_novel", dtype=object)
        return pd.DataFrame(columns=["transcript_id_novel", "transcript_id_ref"])

    # Minimal info needed on matches between novel and reference introns
    joined = joined.as_df()[["transcript_id", "intron_id", "transcript_id_ref", "intron_id_ref"]]

    # 5. Join ordered novel introns with match info
    # 6. Add a tracker column 'match': 1 where the intron is matched, 0 where it is not
    print("preparing for filtering intron matches...")
    step_start = timer()

    if match_type == "any":
        # Intron may match any annotated intron, regardless of reference transcript.
        # Unmatched novel introns get NaN in the reference columns after the left merge;
        # an intron matched by several reference transcripts is kept once.
        novel_ref_match_info = (novel_intron_ids_ordered
                                .merge(joined,
                                       how="left",
                                       on="intron_id",
                                       suffixes=["_novel", "_match"])
                                .assign(match=lambda df: df["transcript_id_ref"].notna().astype(int))
                                .drop_duplicates(subset=["intron_id"])
                               )

        # Minimal informative info is novel tx, novel intron_id & number, match column
        novel_ref_match_info = novel_ref_match_info[["transcript_id_novel", "intron_id", "intron_number", "match"]]

        chain_group_cols = "transcript_id_novel"

    elif match_type == "transcript":
        # Introns (except the terminal ones) must match the same reference transcript.
        # merge_ordered with right_by does a 'grouped merge': for every reference
        # transcript the full list of novel introns is left-merged against that
        # transcript's matches, so unmatched novel introns show up as NaN rows
        # per reference transcript.
        novel_ref_match_info = (pd.merge_ordered(novel_intron_ids_ordered,
                                                 joined,
                                                 how="left",
                                                 on="intron_id",
                                                 right_by="transcript_id_ref",  # group matches by ref tx & join tx by tx
                                                 suffixes=["_novel", "_match"],
                                                 fill_method=None)
                                .sort_values(by=["transcript_id_novel", "intron_number"])
                               )

        # The grouped merge emits rows for every (novel tx, ref tx) combination;
        # keep only the pairs that share at least one intron
        novel_ref_match_info = (novel_ref_match_info
                                .groupby(["transcript_id_novel", "transcript_id_ref"])
                                .filter(lambda df: df["intron_id_ref"].notna().any())
                                .reset_index(drop=True)
                                .assign(match=lambda df: df["intron_id_ref"].notna().astype(int))
                               )

        chain_group_cols = ["transcript_id_novel", "transcript_id_ref"]

    print("took {} s".format(timer() - step_start))

    # 7. Keep chains where every intron matches, or where only a short run of
    #    terminal introns (<= max_terminal_non_match) does not match.
    #    "any" is checked per novel transcript, "transcript" per (novel tx, ref tx) pair.
    print("filtering for valid intron chain matches...")
    step_start = timer()

    filt_novel_ref_match_info = (novel_ref_match_info
                                 .groupby(chain_group_cols)
                                 .filter(lambda x: validate_matching_chain(x, max_terminal_non_match))
                                )

    print("took {} s".format(timer() - step_start))

    # Return simplified df of novel transcript_id & matching transcript_id if applicable
    if match_type == "any":
        return filt_novel_ref_match_info["transcript_id_novel"].drop_duplicates()

    return filt_novel_ref_match_info[["transcript_id_novel", "transcript_id_ref"]].drop_duplicates()

In [None]:
# Run on the small test transcripts with the default match_type="transcript"
filter_transcripts_by_chain(test_novel_gr, test_ref)

In [None]:
# Same test transcripts, but a novel intron may match an intron of any reference transcript
filter_transcripts_by_chain(test_novel_gr, test_ref,match_type="any")

In [24]:
# Wall-clock the match_type="any" run on chr1 with 2 CPUs
pre_chr1 = timer()
chr1_any_matched = filter_transcripts_by_chain(
    stie_chr1_exons,
    ref_chr1_pc_exons,
    match_type="any",
    nb_cpu=2,
)
print("total time taken: {}".format(timer() - pre_chr1))
chr1_any_matched

finding introns...
took 79.48507030098699 (s)
adding intron_id column...
took 0.17554569896310568 s
generating df of novel txipts sorted by intron number...
took 4.423515614995267 s
finding overlaps between novel and reference introns...


2021-07-26 14:19:48,128	INFO services.py:1272 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


took 4.683659655973315 s
filtering overlaps for exact matches...


2021-07-26 14:19:52,562	INFO services.py:1272 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


took 3.6306387850199826 s
preparing for filtering intron matches...
took 0.19747438799822703 s
filtering for valid intron chain matches...
took 0.42044611898018047 s
total time taken: 93.09141320799245


26        PAPA.100.1
86       PAPA.1001.2
139      PAPA.1002.1
208      PAPA.1003.1
210      PAPA.1004.1
            ...     
99544     PAPA.985.1
99546     PAPA.985.6
99612     PAPA.994.2
99678     PAPA.995.2
99690     PAPA.999.1
Name: transcript_id_novel, Length: 1191, dtype: object

In [25]:
# Wall-clock the match_type="transcript" run on chr1 with 2 CPUs.
# NOTE(review): the output recorded for this cell stops during the overlap step
# with RayTaskError(KeyError: 'how') raised in pyranges' join (_write_both), so
# chr1_tx_matched was apparently not produced in the saved run.
# TODO: confirm whether nb_cpu=1 avoids the failure.
pre_chr1 = timer()
chr1_tx_matched = filter_transcripts_by_chain(stie_chr1_exons, ref_chr1_pc_exons, match_type="transcript", nb_cpu=2)
print("time taken: {}".format(timer() - pre_chr1))
chr1_tx_matched

finding introns...
took 84.49933640897507 (s)
adding intron_id column...
took 0.162128378986381 s
generating df of novel txipts sorted by intron number...
took 4.3569289009901695 s
finding overlaps between novel and reference introns...


2021-07-26 14:21:25,989	INFO services.py:1272 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


RayTaskError(KeyError): [36mray::_write_both()[39m (pid=1275, ip=10.97.45.105)
  File "python/ray/_raylet.pyx", line 501, in ray._raylet.execute_task
  File "/home/sam/miniconda3/envs/bioinfo/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 330, in _function_with_tracing
    return function(*args, **kwargs)
  File "/home/sam/miniconda3/envs/bioinfo/lib/python3.8/site-packages/pyranges/methods/join.py", line 111, in _write_both
    how = kwargs["how"]
KeyError: 'how'

2021-07-26 14:21:32,663	ERROR worker.py:78 -- Unhandled error (suppress with RAY_IGNORE_UNHANDLED_ERRORS=1): [36mray::_write_both()[39m (pid=1276, ip=10.97.45.105)
  File "python/ray/_raylet.pyx", line 501, in ray._raylet.execute_task
  File "/home/sam/miniconda3/envs/bioinfo/lib/python3.8/site-packages/ray/util/tracing/tracing_helper.py", line 330, in _function_with_tracing
    return function(*args, **kwargs)
  File "/home/sam/miniconda3/envs/bioinfo/lib/python3.8/site-packages/pyranges/methods/join.py", line 111, in _write_both
    how = kwargs["how"]
KeyError: 'how'


In [27]:
# Pull every record of the transcripts that passed the match_type="any" filter.
# The id set is built once here rather than inside the lambda, which pyranges
# calls separately for each chromosome/strand frame.
matched_tx_ids = set(chr1_any_matched.tolist())
stie_chr1.subset(lambda df: df["transcript_id"].isin(matched_tx_ids))

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,cov,FPKM,TPM,exon_number
0,chr1,StringTie,transcript,827750,859332,1000,+,.,PAPA.35,PAPA.35.14,98.466469,11.679049,30.310225,
1,chr1,StringTie,exon,827750,827775,1000,+,.,PAPA.35,PAPA.35.14,96.734062,,,1
2,chr1,StringTie,exon,829002,829104,1000,+,.,PAPA.35,PAPA.35.14,110.622597,,,2
3,chr1,StringTie,exon,847653,847806,1000,+,.,PAPA.35,PAPA.35.14,118.018204,,,3
4,chr1,StringTie,exon,851926,852110,1000,+,.,PAPA.35,PAPA.35.14,115.267273,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12873,chr1,StringTie,exon,248855557,248855635,1000,-,.,PAPA.2531,PAPA.2531.4,5.161189,,,5
12874,chr1,StringTie,exon,248855724,248855943,1000,-,.,PAPA.2531,PAPA.2531.4,4.278336,,,6
12875,chr1,StringTie,exon,248856287,248856422,1000,-,.,PAPA.2531,PAPA.2531.4,5.487612,,,7
12876,chr1,StringTie,exon,248856513,248856562,1000,-,.,PAPA.2531,PAPA.2531.4,5.115693,,,8


True