# Extract splice junctions for AS-ALE (/ last exon spliced) events.

Due to a pretty silly move on my part, my pipeline does not output splice junctions for novel spliced last exons (nor last exons)
Given want to use SJs to quantify these events in NYGC data, need to extract using only last exon coordinates.

In pipeline, approach to define last exons:
- 3'end doesn't overlap any annotated exon
- terminal splice junction must match an annotated splice junction at the 5'end. 

Can exploit these criteria to grab SJs 'in reverse'

Strategy for novel last exons:
- Extract introns (SJs) from annotated transcripts
- find introns in which last exons are completely contained
- (w/ some form of pr.join), construct a SJ with start coordinate of annotated intron and end coordinate = start of exon coordinate
- As a sanity check, compare with AL's provided list of cryptic SJs inferred with MAJIQ - for common genes, do we get the same SJ coordinates

Strategy for annotated last exons:
- Any spliced last exon that is not captured with novel LE strategy
- pr.join with slack=1 (allow bookended intervals to overlap) annotated SJs with last exon coordinates
- Subset for exact matches at SJ 3'end/last exon 5'end
- Retain the SJs for downstream analysis

In [20]:
import pyranges as pr
import pandas as pd
import numpy as np

In [3]:
majiq_sj = pr.read_bed("../data/Everything_minus_ferguson_liu_recent_new_cryptics.junctions.bed")
ref_gtf = pr.read_gtf("../data/reference_filtered.gtf")
le = pr.read_bed("../data/2023-07-04_papa_cryptic_spliced.last_exons.bed") # generated in motifs/ subdir, all cryptic last exon coords used for iCLIP analysis

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 3.58 µs


In [6]:
# ref gtf has lots of attributes don't need, subset to minimal cols for analysis
ref_gtf = ref_gtf[["Feature", "gene_id", "transcript_id", "gene_name"]]

In [13]:
# subset le for cryptic LEs only
le_cryp = le.subset(lambda df: df.Name.str.contains("cryptic", regex=False))
ids_le_cryp = set(le_cryp.Name)
le_cryp

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,76871267,76871821,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic,.,+
1,chr1,61824444,61825501,ENSG00000132849.22_1|PATJ|spliced|cryptic,.,+
2,chr1,54634687,54639192,ENSG00000162390.18_4|ACOT11|spliced|cryptic,.,+
3,chr1,245464258,245471621,ENSG00000162849.16_2|KIF26B|spliced|cryptic,.,+
4,chr1,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+
...,...,...,...,...,...,...
102,chr22,37464524,37465718,ENSG00000100060.18_2|MFNG|spliced|cryptic,.,-
103,chrX,102721363,102724864,ENSG00000198908.12_1|BHLHB9|spliced|cryptic,.,+
104,chrX,98679426,98679978,ENSG00000281566.3_1|ENSG00000281566|spliced|cr...,.,+
105,chrX,17835910,17837395,ENSG00000131831.18_1|RAI2|spliced|cryptic,.,-


In [7]:
# extract introns (SJs) for each transcript
ref_introns = ref_gtf.features.introns(by="transcript")
# store a df noting which transcript IDs share the same intron coordinates (useful if collapse later and want to track/retain)
intron2tx = ref_introns.as_df()[["Chromosome", "Start", "End", "Strand", "transcript_id"]]



In [18]:
# first extract LEs completely contained within annotated introns
le_cryp_cont = le_cryp.overlap(ref_introns, strandedness="same", how="containment")

# now overlap-join introns with LEs
ref_introns_le = ref_introns.join(le_cryp_cont, strandedness="same")
ref_introns_le

Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,transcript_id,gene_name,Start_b,End_b,Name,Score,Strand_b
0,chr1,intron,1616614,1623430,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+
1,chr1,intron,1616614,1623430,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+
2,chr1,intron,1616614,1623430,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+
3,chr1,intron,1616614,1623430,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+
4,chr1,intron,1616614,1623430,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1492,chrX,intron,108221374,108310747,-,ENSG00000197565.17,ENST00000538570.5,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-
1493,chrX,intron,108221374,108310747,-,ENSG00000197565.17,ENST00000538570.5,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-
1494,chrX,intron,108221374,108310747,-,ENSG00000197565.17,ENST00000621266.4,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-
1495,chrX,intron,108221374,108310747,-,ENSG00000197565.17,ENST00000621266.4,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-


In [23]:
# If + strand
# Start coord = intron start (i.e. 'Start' column), End coord = exon start (i.e. 'Start_b' column)
# If - strand
# Start coord = Exon start (i.e. 'End_b' column), End coord = intron start (i.e. 'End' column)

def _df_intron_exon_to_sj(df: pd.DataFrame) -> pd.DataFrame:
    '''Convert joined intron + overlapping exon df of PyRanges object to splice junction coordinates (the first bin between the two intervals)

    If + strand
    Start coord = intron start (i.e. 'Start' column), End coord = exon start (i.e. 'Start_b' column)
    If - strand
    Start coord = Exon start (i.e. 'End_b' column), End coord = intron start (i.e. 'End' column)
    
    Parameters
    ----------
    df : pd.DataFrame
        _description_

    Returns
    -------
    pd.DataFrame
        _description_

    Raises
    ------
    Exception
        Strand column of df doesn't contain all '+' or '-'
    '''

    df[["Start_intron", "End_intron"]] = df[["Start", "End"]]

    if (df.Strand == "+").all():
        df["End"] = df["Start_b"]

    elif (df.Strand == "-").all():
        df["Start"] = df["End_b"]

    else:
        raise Exception("Strand column must only contain all '+' or '-'")
    
    return df


le_sj = ref_introns_le.apply(_df_intron_exon_to_sj)
le_sj

Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,transcript_id,gene_name,Start_b,End_b,Name,Score,Strand_b,Start_intron,End_intron
0,chr1,intron,1616614,1616614,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+,1616614,1616614
1,chr1,intron,1616614,1616614,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+,1616614,1616614
2,chr1,intron,1616614,1616614,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+,1616614,1616614
3,chr1,intron,1616614,1616614,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+,1616614,1616614
4,chr1,intron,1616614,1616614,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+,1616614,1616614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1492,chrX,intron,108269152,108310747,-,ENSG00000197565.17,ENST00000538570.5,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,108269152,108310747
1493,chrX,intron,108269152,108310747,-,ENSG00000197565.17,ENST00000538570.5,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,108269152,108310747
1494,chrX,intron,108269152,108310747,-,ENSG00000197565.17,ENST00000621266.4,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,108269152,108310747
1495,chrX,intron,108269152,108310747,-,ENSG00000197565.17,ENST00000621266.4,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,108269152,108310747


In [25]:
le_sj.subset(lambda df: df.End - df.Start == 0).drop_duplicate_positions() # perhaps these two are annotated in some way...

Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,transcript_id,gene_name,Start_b,End_b,Name,Score,Strand_b,Start_intron,End_intron
0,chr1,intron,1616614,1616614,+,ENSG00000197530.13,ENST00000355826.10,MIB2,1616614,1619210,ENSG00000197530.13_1|MIB2|spliced|cryptic,.,+,1616614,1616614
1,chr1,intron,20776596,20776596,-,ENSG00000127483.19,ENST00000312239.10,HP1BP3,20775449,20776596,ENSG00000127483.19_1|HP1BP3|spliced|cryptic,.,-,20776596,20776596


In [26]:
# remove 0 length SJs for now as likely some annotation issue - check back at end after mined annotated SJs
le_sj = le_sj.subset(lambda df: df.Start != df.End)
le_sj

Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,transcript_id,gene_name,Start_b,End_b,Name,Score,Strand_b,Start_intron,End_intron
0,chr1,intron,245419745,245464258,+,ENSG00000162849.16,ENST00000407071.7,KIF26B,245464258,245471621,ENSG00000162849.16_2|KIF26B|spliced|cryptic,.,+,245419745,245464258
1,chr1,intron,61823079,61824444,+,ENSG00000132849.22,ENST00000459752.5,PATJ,61824444,61825501,ENSG00000132849.22_1|PATJ|spliced|cryptic,.,+,61823079,61824444
2,chr1,intron,61823079,61824444,+,ENSG00000132849.22,ENST00000459752.5,PATJ,61824444,61825501,ENSG00000132849.22_1|PATJ|spliced|cryptic,.,+,61823079,61824444
3,chr1,intron,61823079,61824444,+,ENSG00000132849.22,ENST00000459752.5,PATJ,61824444,61825501,ENSG00000132849.22_1|PATJ|spliced|cryptic,.,+,61823079,61824444
4,chr1,intron,76868742,76871267,+,ENSG00000117069.15,ENST00000477717.6,ST6GALNAC5,76871267,76871821,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic,.,+,76868742,76871267
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1358,chrX,intron,108269152,108310747,-,ENSG00000197565.17,ENST00000538570.5,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,108269152,108310747
1359,chrX,intron,108269152,108310747,-,ENSG00000197565.17,ENST00000538570.5,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,108269152,108310747
1360,chrX,intron,108269152,108310747,-,ENSG00000197565.17,ENST00000621266.4,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,108269152,108310747
1361,chrX,intron,108269152,108310747,-,ENSG00000197565.17,ENST00000621266.4,COL4A6,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,108269152,108310747


In [28]:
# finally generate a cleaned BED-ready file of SJs
le_sj_clean = le_sj[["Name"]]
le_sj_clean.Score = "."
le_sj_clean

Unnamed: 0,Chromosome,Start,End,Strand,Name,Score
0,chr1,245419745,245464258,+,ENSG00000162849.16_2|KIF26B|spliced|cryptic,.
1,chr1,61823079,61824444,+,ENSG00000132849.22_1|PATJ|spliced|cryptic,.
2,chr1,61823079,61824444,+,ENSG00000132849.22_1|PATJ|spliced|cryptic,.
3,chr1,61823079,61824444,+,ENSG00000132849.22_1|PATJ|spliced|cryptic,.
4,chr1,76868742,76871267,+,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic,.
...,...,...,...,...,...,...
1358,chrX,108269152,108310747,-,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.
1359,chrX,108269152,108310747,-,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.
1360,chrX,108269152,108310747,-,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.
1361,chrX,108269152,108310747,-,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.
