# Get last exon coordinates for experimental validation with 3'RACE

Need:
- Cryptic last exon coordinates (with 3'end precision/resolution)
- Annotated alternative last exon (primary/dominant isoform, again with 3'end precision)

The last exon BEDs used for the iCLIP/motif analyses are unlikely to contain the annotated alternative last exons for the same gene (for ALE & IPA events), so these need to be retrieved from the reference annotation:
- Extract all annotated alternative last exons from reference
- Output to BED file, check in i3Neurons for major expressed isoform

3'Ext events should be straightforward - just use the selected proximal and distal isoforms, after checking that the proximal/distal coordinates are valid


In [1]:
import pyranges as pr
import pandas as pd
from typing import Literal
import os



In [2]:
# Load the reference annotation (GENCODE v40, per the file name) as a PyRanges object
gtf = pr.read_gtf("data/regions/gencode.v40.annotation.gtf")

In [3]:
# Load the last exon BEDs, one per event type
# (per the file names: bleedthrough -> ipa, spliced -> ale, d3utr -> three_ext)
ipa = pr.read_bed("data/regions/2023-12-14_papa_cryptic_bleedthrough_uniq.background_shsy5y.last_exons.bed")
ale = pr.read_bed("data/regions/2023-12-14_papa_cryptic_spliced.background_shsy5y.last_exons.bed")
three_ext = pr.read_bed("data/regions/2023-12-15_papa_cryptic_d3utr.background_shsy5y.last_exons.bed")

In [4]:
# Keep only the intervals whose Name contains "cryptic" in each event-type BED
def _is_cryptic(df):
    return df.Name.str.contains("cryptic", regex=False)


ipa_cryp, ale_cryp, three_ext_cryp = (bed.subset(_is_cryptic) for bed in (ipa, ale, three_ext))


# Gene name is the second '|'-delimited field of Name
# e.g. ENSG00000119392.16_1|GLE1|bleedthrough|cryptic
def _gene_names(cryp_gr):
    name_fields = cryp_gr.Name.str.split("|", expand=True)
    return name_fields[1].tolist()


ipa_cryp_gn = _gene_names(ipa_cryp)
ale_cryp_gn = _gene_names(ale_cryp)
# Combined list of genes with a cryptic IPA or ALE event (duplicates are not removed)
cryp_gn = [*ipa_cryp_gn, *ale_cryp_gn]

len(cryp_gn)

112

In [5]:
# Restrict the reference annotation to genes with a cryptic IPA or ALE event
gtf_cryp = gtf.subset(lambda df: df.gene_name.isin(cryp_gn))
gtf_cryp

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,gene,21440127,21485005,.,+,.,ENSG00000142794.19,protein_coding,...,,,,,,,,,,
1,chr1,HAVANA,transcript,21440127,21483467,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2,CCDS,OTTHUMT00000476522.1,,,,ENSP00000415711.2,CCDS57977.1
2,chr1,HAVANA,exon,21440127,21440348,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2,CCDS,OTTHUMT00000476522.1,1,ENSE00001546346.2,,ENSP00000415711.2,CCDS57977.1
3,chr1,HAVANA,exon,21444947,21445219,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2,CCDS,OTTHUMT00000476522.1,2,ENSE00003642335.1,,ENSP00000415711.2,CCDS57977.1
4,chr1,HAVANA,CDS,21445086,21445219,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2,CCDS,OTTHUMT00000476522.1,2,ENSE00003642335.1,,ENSP00000415711.2,CCDS57977.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25631,chrX,HAVANA,transcript,131823763,131830862,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-211,,TAGENE,OTTHUMT00000513644.1,,,,,
25632,chrX,HAVANA,exon,131830291,131830862,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-211,,TAGENE,OTTHUMT00000513644.1,1,ENSE00003877682.1,,,
25633,chrX,HAVANA,exon,131823763,131825365,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-211,,TAGENE,OTTHUMT00000513644.1,2,ENSE00003853674.1,,,
25634,chrX,HAVANA,transcript,131829923,131830622,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-212,,TAGENE,OTTHUMT00000505674.1,,,,,


In [6]:
# Keep exon features only
gtf_cryp_ex = gtf_cryp.subset(lambda df: df.Feature == "exon")
# Cast exon_number to int so that sorting by exon number downstream is numeric
gtf_cryp_ex.exon_number = gtf_cryp_ex.exon_number.astype(int)
gtf_cryp_ex

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,exon,21440127,21440348,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2,CCDS,OTTHUMT00000476522.1,1,ENSE00001546346.2,,ENSP00000415711.2,CCDS57977.1
1,chr1,HAVANA,exon,21444947,21445219,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2,CCDS,OTTHUMT00000476522.1,2,ENSE00003642335.1,,ENSP00000415711.2,CCDS57977.1
2,chr1,HAVANA,exon,21470631,21470734,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2,CCDS,OTTHUMT00000476522.1,3,ENSE00003734729.1,,ENSP00000415711.2,CCDS57977.1
3,chr1,HAVANA,exon,21471568,21471783,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2,CCDS,OTTHUMT00000476522.1,4,ENSE00003723776.1,,ENSP00000415711.2,CCDS57977.1
4,chr1,HAVANA,exon,21472842,21472915,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2,CCDS,OTTHUMT00000476522.1,5,ENSE00003753961.1,,ENSP00000415711.2,CCDS57977.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12258,chrX,HAVANA,exon,131825221,131825365,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-208,,TAGENE,OTTHUMT00000507677.1,2,ENSE00001659046.1,,,
12259,chrX,HAVANA,exon,131804594,131805037,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-208,,TAGENE,OTTHUMT00000507677.1,3,ENSE00003882991.1,,,
12260,chrX,HAVANA,exon,131830291,131830862,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-211,,TAGENE,OTTHUMT00000513644.1,1,ENSE00003877682.1,,,
12261,chrX,HAVANA,exon,131823763,131825365,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-211,,TAGENE,OTTHUMT00000513644.1,2,ENSE00003853674.1,,,


In [7]:
# TODO: make sure type hinting for get_terminal_regions is correct (by default stranded, but can be one of stranded/unstranded)
def get_terminal_regions(gr: pr.PyRanges,
                         feature_col: str = "Feature",
                         feature_key: str = "exon",
                         id_col: str = "transcript_id",
                         region_number_col: str = "exon_number",
                         number_type: Literal["stranded", "unstranded"] = "stranded",
                         which_region: Literal["first", "last"] = "last",
                         filter_single: bool = False,
                         ) -> pr.PyRanges:
    '''Return the first/last interval in each group of intervals

    Requires a column that provides a 1..n numbering of intervals within each group.
    Extraction is always with respect to strand: the numbering itself may be
    strand-aware or not, as declared by number_type.

    Parameters
    ----------
    gr : pr.PyRanges
        Intervals to select from. Must only contain rows whose feature_col value
        equals feature_key.
    feature_col : str, optional
        Column holding the feature type, by default "Feature"
    feature_key : str, optional
        The only value permitted in feature_col, by default "exon"
    id_col : str, optional
        Column defining the groups (one terminal interval is returned per value),
        by default "transcript_id"
    region_number_col : str, optional
        Column with the 1..n rank of each interval within its group. Values are
        compared numerically even if stored as strings, by default "exon_number"
    number_type : Literal["stranded", "unstranded"], optional
        "stranded" - rank 1 is the first region of the group regardless of strand.
        "unstranded" - ranks do not respect strand, so first/last are flipped
        for minus-strand groups. By default "stranded"
    which_region : Literal["first", "last"], optional
        Which terminal interval to return, by default "last"
    filter_single : bool, optional
        If True, drop groups consisting of a single interval before selecting
        (before/after group counts are printed), by default False

    Returns
    -------
    pr.PyRanges
        One row per id_col value (per chromosome/strand frame) holding the
        requested terminal interval, with all input columns retained.
    '''

    assert number_type in ["stranded", "unstranded"]
    assert which_region in ["first", "last"]
    assert region_number_col in gr.columns.tolist()
    assert feature_col in gr.columns.tolist()
    assert id_col in gr.columns.tolist()

    # Make sure only 'exon' features are in the gr
    assert gr.as_df()[feature_col].drop_duplicates().tolist() == [feature_key], "only {} entries should be present in gr".format(feature_key)

    def _sort_by_id_and_number(df):
        # Sort on a numeric view of region_number_col so string-typed numbers
        # (e.g. "10" vs "2") are ranked by value rather than lexicographically.
        # The column itself is returned untouched (values and dtype).
        number_key = df[region_number_col]
        if not pd.api.types.is_numeric_dtype(number_key):
            number_key = pd.to_numeric(number_key.astype(str))

        tmp_col = "__region_number_sort_key"
        return (df.assign(**{tmp_col: number_key.to_numpy()})
                  .sort_values(by=[id_col, tmp_col], ascending=True)
                  .drop(columns=tmp_col))

    # Make sure gr is sorted by transcript_id & 'region number' (ascending order so 1..n)
    mod_gr = gr.apply(_sort_by_id_and_number, nb_cpu=1)

    # Filter out single-exon transcripts
    if filter_single:
        print("Filtering for multi-exon transcripts...")
        print("Before: {}".format(len(set(mod_gr.as_df()[id_col].tolist()))))

        # keep=False marks every member of a duplicated ID as True (so keep these)
        mod_gr = mod_gr.subset(lambda df: df.duplicated(subset=[id_col], keep=False), nb_cpu=1)

        print("After: {}".format(len(set(mod_gr.as_df()[id_col].tolist()))))

    # duplicated(keep=X) marks entry X of each ID as False and all others True,
    # so negating it retains only entry X of each ID.
    if number_type == "stranded":
        # Rank 1 = first region regardless of strand, so first/last rank maps
        # directly onto first/last region
        return mod_gr.subset(lambda df: ~(df.duplicated(subset=[id_col], keep=which_region)),
                             nb_cpu=1)

    # Numbering doesn't respect strand: plus strand uses the requested end of
    # the ranking, minus strand uses the opposite end
    if which_region == "first":
        plus_keep, minus_keep = "first", "last"
    else:
        plus_keep, minus_keep = "last", "first"

    return mod_gr.subset(lambda df:
                         ((df["Strand"] == "+") & ~(df.duplicated(subset=[id_col], keep=plus_keep))) |
                         ((df["Strand"] == "-") & ~(df.duplicated(subset=[id_col], keep=minus_keep))),
                         nb_cpu=1)


def _df_collapse_metadata(df, id_col, standard_cols, collapse_cols, collapse_uniq_cols, collapse_sep):
    '''
    Intended to be applied to internal dfs of PyRanges objects
    '''

    found_collapsed = [col for col in collapse_cols if col in df.columns]

    not_found_collapsed = set(collapse_cols) - set(found_collapsed)

    if len(not_found_collapsed) > 0:
        chr_strand = f"{df.Chromosome.drop_duplicates()[0]},{df.Strand.drop_duplicates()[0]}"
        print(f"following 'collapse_cols' columns not found in df (chr/strand) - {chr_strand} - {', '.join(not_found_collapsed)}")

    grouped = df.groupby(id_col)

    # Pick first entry for all standard_cols, these should be same for all rows of id_col
    # Leaves a df with id_col values as index
    std_collapsed = grouped[standard_cols].first()

    # For collapse cols, collapse to single row of delimited strings for each column
    # Again leave a df with id_col values as index labels
    clp_collapsed = grouped[found_collapsed].agg(lambda col: collapse_sep.join(col.astype(str)))

    if collapse_uniq_cols is not None:
        # Collapse these cols to single row of delimited strings whilst dropping duplicates
        # Again leave a df with id_col values as index labels
        clp_uniq_collapsed = grouped[collapse_uniq_cols].agg(lambda col: collapse_sep.join(list(dict.fromkeys(col.astype(str)))))

        int_collapsed = clp_collapsed.merge(clp_uniq_collapsed, left_index=True, right_index=True)

        collapsed = std_collapsed.merge(int_collapsed, left_index=True, right_index=True)

    else:
        # combine by id_col
        collapsed = std_collapsed.merge(clp_collapsed, left_index=True, right_index=True)

    return collapsed


def collapse_metadata(gr,
                      id_col="transcript_id",
                      standard_cols=["Chromosome", "Start", "End", "Strand"],
                      collapse_cols=None,
                      collapse_uniq_cols=None,
                      collapse_sep=","):
    '''
    Reduce gr to one row per id_col value, keeping metadata from the merged rows as delimited strings

    standard_cols: column labels whose value is identical across all rows of an id_col group, so no collapsing is needed.
        The first value per group is kept and the dtype is retained (important for the PyRanges standard columns, which should not become strings)
    collapse_cols: column labels whose values are joined into a single collapse_sep-delimited string per group
        If None, every column of gr other than standard_cols, id_col & collapse_uniq_cols is collapsed
    collapse_uniq_cols: column labels to join as for collapse_cols, but with duplicate values dropped (order of first appearance is kept)
    collapse_sep: delimiter placed between joined values
    '''

    # Every requested column must be present in gr
    assert all(col in gr.columns for col in standard_cols)

    has_uniq_cols = collapse_uniq_cols is not None

    if has_uniq_cols:
        assert all(col in gr.columns for col in collapse_uniq_cols)

    if collapse_cols is not None:
        assert all(col in gr.columns for col in collapse_cols)

    else:
        # Default: collapse everything that isn't handled by another argument
        excluded = standard_cols + [id_col]

        if has_uniq_cols:
            excluded = excluded + collapse_uniq_cols

        collapse_cols = [col for col in gr.columns if col not in excluded]

    def _collapse_frame(df):
        # Applied to each internal (chromosome/strand) df of gr
        return _df_collapse_metadata(df,
                                     id_col,
                                     standard_cols,
                                     collapse_cols,
                                     collapse_uniq_cols,
                                     collapse_sep)

    return gr.apply(_collapse_frame)


In [8]:
# Last exon of every transcript (function defaults: which_region="last", stranded exon numbering)
gtf_cryp_ex_le = get_terminal_regions(gtf_cryp_ex)
gtf_cryp_ex_le

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
0,chr1,HAVANA,exon,62117131,62117393,.,+,.,ENSG00000132849.22,protein_coding,...,protein_coding,PATJ-201,2,basic,OTTHUMT00000488269.2,12,ENSE00003788988.1,,ENSP00000307496.8,
1,chr1,HAVANA,exon,21483142,21485005,.,+,.,ENSG00000142794.19,protein_coding,...,nonsense_mediated_decay,NBPF3-201,2,,OTTHUMT00000008190.3,18,ENSE00001464927.1,,ENSP00000316739.7,
2,chr1,HAVANA,exon,21483142,21484900,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-202,1,CCDS,OTTHUMT00000476523.1,15,ENSE00001814163.1,,ENSP00000316782.5,CCDS216.1
3,chr1,HAVANA,exon,77062974,77064058,.,+,.,ENSG00000117069.15,protein_coding,...,nonsense_mediated_decay,ST6GALNAC5-201,5,,OTTHUMT00000026693.4,5,ENSE00001674568.2,,ENSP00000436263.1,
4,chr1,HAVANA,exon,21483142,21484900,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-203,2,CCDS,OTTHUMT00000008193.1,15,ENSE00001814163.1,,ENSP00000340336.5,CCDS57976.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1413,chrX,HAVANA,exon,131794113,131794466,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-213,,TAGENE,OTTHUMT00000514677.1,4,ENSE00003865783.1,,,
1414,chrX,HAVANA,exon,131689737,131689840,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-214,,TAGENE,OTTHUMT00000512454.1,10,ENSE00003872410.1,,,
1415,chrX,HAVANA,exon,131689754,131692636,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-215,,TAGENE,OTTHUMT00000521770.1,11,ENSE00003880552.1,,,
1416,chrX,HAVANA,exon,131691522,131692636,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-216,,TAGENE,OTTHUMT00000508518.1,13,ENSE00003853517.1,,,


In [9]:
# Inspect the transcript-level metadata attached to each last exon
gtf_cryp_ex_le[["gene_name", "transcript_type", "transcript_support_level", "tag", "exon_number"]]

Unnamed: 0,Chromosome,Start,End,Strand,gene_name,transcript_type,transcript_support_level,tag,exon_number
0,chr1,62117131,62117393,+,PATJ,protein_coding,2,basic,12
1,chr1,21483142,21485005,+,NBPF3,nonsense_mediated_decay,2,,18
2,chr1,21483142,21484900,+,NBPF3,protein_coding,1,CCDS,15
3,chr1,77062974,77064058,+,ST6GALNAC5,nonsense_mediated_decay,5,,5
4,chr1,21483142,21484900,+,NBPF3,protein_coding,2,CCDS,15
...,...,...,...,...,...,...,...,...,...
1413,chrX,131794113,131794466,-,FIRRE,lncRNA,,TAGENE,4
1414,chrX,131689737,131689840,-,FIRRE,lncRNA,,TAGENE,10
1415,chrX,131689754,131692636,-,FIRRE,lncRNA,,TAGENE,11
1416,chrX,131691522,131692636,-,FIRRE,lncRNA,,TAGENE,13


In [10]:
# Sanity check of the collapsing strategy on a single gene (SIN3B)
tmp_gr = gtf_cryp_ex_le.subset(lambda df: df.gene_name == "SIN3B")
# Cluster is assigned per (gene_name, Start, End) on the same strand
tmp_gr = tmp_gr.cluster(by=["gene_name", "Start", "End"], strand=True)
# Collapsing by gene_name gives one row per gene (coordinates of the first row only)...
tmp_gr_orig = collapse_metadata(tmp_gr, id_col="gene_name", collapse_cols=["transcript_id"], collapse_uniq_cols=["gene_name"], collapse_sep=";")
# ...whereas collapsing by Cluster gives one row per distinct set of last exon coordinates
tmp_gr_cluster = collapse_metadata(tmp_gr, id_col="Cluster", collapse_cols=["transcript_id"], collapse_uniq_cols=["gene_name"], collapse_sep=";")
tmp_gr_orig

Unnamed: 0,Chromosome,Start,End,Strand,transcript_id,gene_name
0,chr19,16854142,16855745,+,ENST00000596802.5;ENST00000596638.1;ENST000005...,SIN3B


In [11]:
# Per-cluster collapse of the SIN3B last exons
tmp_gr_cluster

Unnamed: 0,Chromosome,Start,End,Strand,transcript_id,gene_name
0,chr19,16854142,16855745,+,ENST00000596802.5,SIN3B
1,chr19,16862883,16863286,+,ENST00000596638.1,SIN3B
2,chr19,16865409,16865590,+,ENST00000599880.1,SIN3B
3,chr19,16869459,16869623,+,ENST00000594372.1,SIN3B
4,chr19,16871228,16871638,+,ENST00000602204.1,SIN3B
5,chr19,16872467,16872666,+,ENST00000595900.1,SIN3B
6,chr19,16878496,16878706,+,ENST00000595049.5,SIN3B
7,chr19,16878496,16878942,+,ENST00000594235.1,SIN3B
8,chr19,16878496,16878961,+,ENST00000595541.1,SIN3B
9,chr19,16878496,16879060,+,ENST00000601141.5,SIN3B


In [14]:
# Cluster assignment for each SIN3B transcript's last exon
tmp_gr[["gene_name", "transcript_id", "Cluster"]]

Unnamed: 0,Chromosome,Start,End,Strand,gene_name,transcript_id,Cluster
0,chr19,16854142,16855745,+,SIN3B,ENST00000596802.5,1
1,chr19,16862883,16863286,+,SIN3B,ENST00000596638.1,2
2,chr19,16865409,16865590,+,SIN3B,ENST00000599880.1,3
3,chr19,16869459,16869623,+,SIN3B,ENST00000594372.1,4
4,chr19,16871228,16871638,+,SIN3B,ENST00000602204.1,5
5,chr19,16872467,16872666,+,SIN3B,ENST00000595900.1,6
6,chr19,16878496,16878706,+,SIN3B,ENST00000595049.5,7
7,chr19,16878496,16878942,+,SIN3B,ENST00000594235.1,8
8,chr19,16878496,16878961,+,SIN3B,ENST00000595541.1,9
9,chr19,16878496,16879060,+,SIN3B,ENST00000601141.5,10


In [15]:
# To create BED file, easier if each interval (set of coordinates) is unique
# Simplicity - assign a shared Cluster ID to last exons with identical gene_name, Start and End (same strand)
gtf_cryp_ex_le = gtf_cryp_ex_le.cluster(by=["gene_name", "Start", "End"], strand=True)
gtf_cryp_ex_le

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid,Cluster
0,chr1,HAVANA,exon,54597258,54598858,.,+,.,ENSG00000162390.18,protein_coding,...,ACOT11-205,2,,OTTHUMT00000027353.1,8,ENSE00001846834.1,,,,1
1,chr1,HAVANA,exon,54599295,54599658,.,+,.,ENSG00000162390.18,protein_coding,...,ACOT11-203,3,,OTTHUMT00000027354.1,2,ENSE00001886392.1,,,,2
2,chr1,HAVANA,exon,54608956,54610124,.,+,.,ENSG00000162390.18,protein_coding,...,ACOT11-204,2,,OTTHUMT00000027352.1,15,ENSE00001886798.1,,,,3
3,chr1,HAVANA,exon,54608956,54610329,.,+,.,ENSG00000162390.18,protein_coding,...,ACOT11-201,1,CCDS,OTTHUMT00000027351.1,16,ENSE00001385652.2,,ENSP00000340260.2,CCDS593.1,4
4,chr1,HAVANA,exon,54634687,54639192,.,+,.,ENSG00000162390.18,protein_coding,...,ACOT11-202,1,CCDS,OTTHUMT00000027356.1,17,ENSE00001454930.3,,ENSP00000360366.3,CCDS592.1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1413,chrX,HAVANA,exon,17800048,17802034,.,-,.,ENSG00000131831.18,protein_coding,...,RAI2-204,1,CCDS,OTTHUMT00000504349.2,2,ENSE00003849071.1,,ENSP00000401323.1,CCDS14183.1,1183
1414,chrX,HAVANA,exon,17800050,17802034,.,-,.,ENSG00000131831.18,protein_coding,...,RAI2-201,2,CCDS,OTTHUMT00000055937.1,3,ENSE00001300453.1,,ENSP00000333456.1,CCDS14183.1,1184
1415,chrX,HAVANA,exon,17800050,17802034,.,-,.,ENSG00000131831.18,protein_coding,...,RAI2-202,1,CCDS,OTTHUMT00000055938.1,3,ENSE00001300453.1,,ENSP00000353106.1,CCDS14183.1,1184
1416,chrX,ENSEMBL,exon,17800050,17802034,.,-,.,ENSG00000131831.18,protein_coding,...,RAI2-206,4,CCDS,,3,ENSE00001300453.1,,ENSP00000444210.1,CCDS14183.1,1184


In [16]:
# collapse redundant last exon coordinates (one row per Cluster), joining the corresponding
# transcript_ids with ';' and keeping the unique gene name
gtf_cryp_ex_le_clpsd = collapse_metadata(gtf_cryp_ex_le, id_col="Cluster", collapse_cols=["transcript_id"], collapse_uniq_cols=["gene_name"], collapse_sep=";")
gtf_cryp_ex_le_clpsd

Unnamed: 0,Chromosome,Start,End,Strand,transcript_id,gene_name
0,chr1,54597258,54598858,+,ENST00000498228.1,ACOT11
1,chr1,54599295,54599658,+,ENST00000479837.1,ACOT11
2,chr1,54608956,54610124,+,ENST00000481208.5,ACOT11
3,chr1,54608956,54610329,+,ENST00000343744.7,ACOT11
4,chr1,54634687,54639192,+,ENST00000371316.3,ACOT11
...,...,...,...,...,...,...
1180,chrX,40726527,40726851,-,ENST00000463072.1,MED14
1181,chrX,17800048,17801725,-,ENST00000415486.7,RAI2
1182,chrX,17800048,17802034,-,ENST00000451717.6,RAI2
1183,chrX,17800050,17802034,-,ENST00000331511.5;ENST00000360011.5;ENST000005...,RAI2


In [17]:
# Add the minimal columns required for BED output: a constant Score and a Name column
def _constant_col(value):
    # Column filled with a single value, aligned to the frame's index
    return lambda df: pd.Series([value] * len(df), index=df.index)


def _bed_name(df):
    # Name = gene_name|annot_status|transcript_id(s)
    trailing_fields = df[["annot_status", "transcript_id"]].astype(str)
    return df.gene_name.str.cat(trailing_fields, sep="|")


gtf_cryp_ex_le_clpsd = gtf_cryp_ex_le_clpsd.assign("annot_status", _constant_col("annotated"))
gtf_cryp_ex_le_clpsd = gtf_cryp_ex_le_clpsd.assign("Score", _constant_col("."))
gtf_cryp_ex_le_clpsd = gtf_cryp_ex_le_clpsd.assign("Name", _bed_name)

gtf_cryp_ex_le_clpsd

Unnamed: 0,Chromosome,Start,End,Strand,transcript_id,gene_name,annot_status,Score,Name
0,chr1,54597258,54598858,+,ENST00000498228.1,ACOT11,annotated,.,ACOT11|annotated|ENST00000498228.1
1,chr1,54599295,54599658,+,ENST00000479837.1,ACOT11,annotated,.,ACOT11|annotated|ENST00000479837.1
2,chr1,54608956,54610124,+,ENST00000481208.5,ACOT11,annotated,.,ACOT11|annotated|ENST00000481208.5
3,chr1,54608956,54610329,+,ENST00000343744.7,ACOT11,annotated,.,ACOT11|annotated|ENST00000343744.7
4,chr1,54634687,54639192,+,ENST00000371316.3,ACOT11,annotated,.,ACOT11|annotated|ENST00000371316.3
...,...,...,...,...,...,...,...,...,...
1180,chrX,40726527,40726851,-,ENST00000463072.1,MED14,annotated,.,MED14|annotated|ENST00000463072.1
1181,chrX,17800048,17801725,-,ENST00000415486.7,RAI2,annotated,.,RAI2|annotated|ENST00000415486.7
1182,chrX,17800048,17802034,-,ENST00000451717.6,RAI2,annotated,.,RAI2|annotated|ENST00000451717.6
1183,chrX,17800050,17802034,-,ENST00000331511.5;ENST00000360011.5;ENST000005...,RAI2,annotated,.,RAI2|annotated|ENST00000331511.5;ENST000003600...


In [18]:
# Build one combined (cryptic + annotated last exon) BED per event type: ALE and IPA
def _cryptic_plus_annotated(cryp_gr, gene_names):
    # Annotated last exons of the genes carrying this event type, reduced to the BED columns
    annotated = gtf_cryp_ex_le_clpsd.subset(lambda df: df.gene_name.isin(gene_names))
    combined = pr.concat([cryp_gr, annotated[["Name", "Score"]]])
    return combined.sort()


ale_cryp_annot_bed = _cryptic_plus_annotated(ale_cryp, ale_cryp_gn)
ipa_cryp_annot_bed = _cryptic_plus_annotated(ipa_cryp, ipa_cryp_gn)

ale_cryp_annot_bed

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,54597258,54598858,ACOT11|annotated|ENST00000498228.1,.,+
1,chr1,54599295,54599658,ACOT11|annotated|ENST00000479837.1,.,+
2,chr1,54608956,54610124,ACOT11|annotated|ENST00000481208.5,.,+
3,chr1,54608956,54610329,ACOT11|annotated|ENST00000343744.7,.,+
4,chr1,54634687,54639192,ENSG00000162390.18_4|ACOT11|spliced|cryptic,.,+
...,...,...,...,...,...,...
1098,chrX,108155613,108157260,COL4A6|annotated|ENST00000334504.12,.,-
1099,chrX,108161630,108161735,COL4A6|annotated|ENST00000487645.1,.,-
1100,chrX,108213791,108214228,COL4A6|annotated|ENST00000468338.5,.,-
1101,chrX,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-


In [19]:
# Combined cryptic + annotated last exons for IPA genes
ipa_cryp_annot_bed

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,21442239,21442358,NBPF3|annotated|ENST00000475869.2,.,+
1,chr1,21446036,21446482,NBPF3|annotated|ENST00000485941.2,.,+
2,chr1,21451766,21451858,NBPF3|annotated|ENST00000467103.2,.,+
3,chr1,21453372,21454306,NBPF3|annotated|ENST00000478653.6,.,+
4,chr1,21453372,21457150,ENSG00000142794.19_3|NBPF3|bleedthrough|cryptic,.,+
...,...,...,...,...,...,...
189,chrX,131795049,131796331,FIRRE|annotated|ENST00000650184.1,.,-
190,chrX,131804594,131805037,FIRRE|annotated|ENST00000653136.1,.,-
191,chrX,131823763,131825365,FIRRE|annotated|ENST00000657242.1,.,-
192,chrX,131823775,131825221,ENSG00000213468.7_1|FIRRE|bleedthrough|cryptic,.,-


In [20]:
outdir = "processed/validation_region_beds"
# exist_ok=True avoids the check-then-create race and makes re-running this cell a no-op
os.makedirs(outdir, exist_ok=True)

In [21]:
# Write one BED per event type to outdir
ale_cryp_annot_bed.to_bed(os.path.join(outdir, "2024-04-05_cryptic_and_annotated.ales.bed"))
ipa_cryp_annot_bed.to_bed(os.path.join(outdir, "2024-04-05_cryptic_and_annotated.ipa.bed"))
# NOTE(review): three_ext_cryp was subset to Names containing "cryptic" and no annotated
# entries were added to it, so this file may hold cryptic entries only despite the
# "cryptic_and_annotated" file name - confirm whether the proximal isoforms are wanted here
three_ext_cryp.to_bed(os.path.join(outdir, "2024-04-05_cryptic_and_annotated.3ext.bed"))