In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
from define_helpers import select_rep_site, _define_cryptic_status, select_rep_prox_site, select_rep_five_end
from collections import Counter
import os

In [2]:
# input novel last exons used to generate combined reference of last exons
novel_le = pr.read_gtf("../data/papa/2023-03-29_papa_i3_cortical_upf1_zanovello_overlap_annotated.gtf")
# dexseq results df (used to extract cryptic events of each class)
dexseq_df = pd.read_csv("../data/papa/2023-05-24_i3_cortical_zanovello.all_datasets.dexseq_apa.results.processed.cleaned.tsv", sep="\t")
cryptics_df = pd.read_csv("../../preprocessing/processed/cryptics_summary_all_events.tsv", sep="\t")

# refeence GTF used to identify novel last exons, quantify vs ref
ref_gtf = pr.read_gtf("../data/reference_filtered.gtf")
# 
tx2le = pd.read_csv("../data/papa/novel_ref_combined.tx2le.tsv", sep="\t")

# last exon quantification regions used as input to Salmon
quant_uniq_le = pr.read_gtf("../data/papa/novel_ref_combined.quant.last_exons.gtf")


info_df = pd.read_csv("../data/papa/novel_ref_combined.info.tsv", sep="\t")

In [3]:
# get le_ids that are novel bleedthroughs or ALEs
# le_id_spliced = set(dexseq_df[dexseq_df.simple_event_type == "distal_3utr_extension"].le_id)
# le_id_bleed = set(dexseq_df[dexseq_df.simple_event_type == "distal_3utr_extension"].le_id)
dexseq_df.simple_event_type.value_counts()


simple_event_type
spliced                  130806
bleedthrough              41453
distal_3utr_extension     37225
Name: count, dtype: int64

In [4]:
le_id_spliced = dexseq_df.loc[dexseq_df.simple_event_type == "spliced", "le_id"]
le_id_bleed = dexseq_df.loc[dexseq_df.simple_event_type == "bleedthrough", "le_id"]
le_id_d3utr = dexseq_df.loc[dexseq_df.simple_event_type == "distal_3utr_extension", "le_id"]


In [5]:
print(f"Number of spliced ALE events - {len(set(le_id_spliced))}")
print(f"Number of bleedthrough ALE events - {len(set(le_id_bleed))}")
print(f"Number of 3'UTR-ALE events - {len(set(le_id_d3utr))}")

Number of spliced ALE events - 7978
Number of bleedthrough ALE events - 3599
Number of 3'UTR-ALE events - 3369


In [6]:
# how many le_ids have multiple annotations?
le_id_spliced_bleed = set(le_id_spliced).intersection(set(le_id_bleed))
print(f"Number of le_ids with bleedthrough and spliced annotations - {len(le_id_spliced_bleed)}")

Number of le_ids with bleedthrough and spliced annotations - 986


In [7]:
# remvoe le_ids with multiple annotations from list (to avoid ambiguity about annotations)
# le_id_spliced = le_id_spliced[~le_id_spliced.isin(le_id_spliced_bleed)]
# le_id_bleed = le_id_bleed[~le_id_bleed.isin(le_id_spliced_bleed)]
# print(f"Number of spliced ALE events (after removing category-overlapping events) - {le_id_spliced.nunique()}")
# print(f"Number of bleedthrough ALE events (after removing category-overlapping events) - {le_id_bleed.nunique()}")


In [8]:
cryptics_df.simple_event_type.value_counts()

simple_event_type
spliced                               119
distal_3utr_extension                 104
bleedthrough                           55
bleedthrough,spliced                   12
bleedthrough,distal_3utr_extension      3
Name: count, dtype: int64

In [9]:
cryptics_df.annot_status.value_counts()

annot_status
novel              204
annotated           70
annotated,novel     19
Name: count, dtype: int64

In [10]:
cryptics_df[["annot_status","simple_event_type"]].value_counts(sort=False)

annot_status     simple_event_type                 
annotated        bleedthrough                           18
                 bleedthrough,spliced                    4
                 spliced                                48
annotated,novel  bleedthrough,spliced                    6
                 spliced                                13
novel            bleedthrough                           37
                 bleedthrough,distal_3utr_extension      3
                 bleedthrough,spliced                    2
                 distal_3utr_extension                 104
                 spliced                                58
Name: count, dtype: int64

In [11]:
le_id_spliced_cryp = set(cryptics_df.loc[cryptics_df["simple_event_type"] == "spliced", "le_id"])
le_id_bleed_cryp = set(cryptics_df.loc[cryptics_df["simple_event_type"] == "bleedthrough", "le_id"])


print(f"Number of cryptic spliced last exons - {len(le_id_spliced_cryp)}")
print(f"Number of cryptic bleedthrough last exons - {len(le_id_bleed_cryp)}")

Number of cryptic spliced last exons - 119
Number of cryptic bleedthrough last exons - 55


In [12]:
# get a set of novel cryptic IDs so can track where they go
le_id_spliced_novel_cryp = set(cryptics_df[(cryptics_df["annot_status"].isin(["annotated,novel", "novel"])) & (cryptics_df["simple_event_type"] == "spliced")].le_id)
le_id_bleed_novel_cryp = set(cryptics_df[(cryptics_df["annot_status"].isin(["annotated,novel", "novel"])) & (cryptics_df["simple_event_type"] == "bleedthrough")].le_id)

print(f"Number of novel cryptic spliced last exons - {len(le_id_spliced_novel_cryp)}")
print(f"Number of novel cryptic bleedthrough last exons - {len(le_id_bleed_novel_cryp)}")


Number of novel cryptic spliced last exons - 71
Number of novel cryptic bleedthrough last exons - 37


In [13]:
# good idea to ensure don't consider spliced LEs that have a 3'UTR extension
# UPDATE now need to select representative proximal site for each distal site, since quantification approach lumps together all annotated proximal sites (and compares them to extension) 
# since le_ids are annotated sequentially, know that immediately succeeding le_number is the partner le_id
le_id_spliced_spl = le_id_spliced.str.split("_", regex=False, expand=True)
le_id_spliced_spl[1] = le_id_spliced_spl[1].astype(int).add(1)
# reconstruct le_id (corresponding to theoretical distal 3'UTR extension le_id)
le_id_spliced_d3utr = le_id_spliced_spl[0].str.cat(le_id_spliced_spl[1].astype(str), sep="_")

# now have theoretical distal 3'UTR le_ids, intersect with actual ids - any overlapping will be removed from downstream analysis
spliced_d3utr_olap = le_id_spliced_d3utr.isin(le_id_d3utr.values)
# print(f"Number of spliced last exons with 3'UTR extension - {spliced_d3utr_olap.sum()}")

# now use mask to remove these ALEs
le_id_spliced = le_id_spliced[~spliced_d3utr_olap]
print(f"Number of spliced ALE events (after removing those with a novel extended 3'UTR) - {len(set(le_id_spliced))}")


Number of spliced ALE events (after removing those with a novel extended 3'UTR) - 4854


In [14]:
# add le_id used in downstream analysis
novel_le = novel_le.apply(lambda df: df.merge(tx2le, on="transcript_id", how="inner", suffixes=[None, "_quant"]))


In [15]:
len(set(novel_le.subset(lambda df: df.le_id_quant.isin(le_id_spliced_novel_cryp)).le_id_quant))
len(set(novel_le.subset(lambda df: df.le_id_quant.isin(le_id_spliced_novel_cryp)).le_id_quant)) / len(le_id_spliced_novel_cryp)

1.0

In [16]:
# novel_le.subset(lambda df: df.gene_name_ref.str.contains("STMN2", regex=False))[["Name"]].as_df().to_string()

In [17]:

# Select representative PAS for bleedthroughs and spliced events (novel)
novel_le_rep_spliced, rep_choices_spliced = select_rep_site(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_spliced))), id_col="le_id_quant")


print(f"Number of novel spliced intervals before selecting representative LEs - {len(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_spliced))))}")
print(f"Number of novel spliced intervals after selecting representative LEs - {len(novel_le_rep_spliced)}")
print({dec: len(ids) for dec, ids in rep_choices_spliced.items()})




Number of novel spliced intervals before selecting representative LEs - 6209
Number of novel spliced intervals after selecting representative LEs - 1341
{'atlas_1_pred': 571, 'atlas_max_datasets': 143, 'atlas_max_datasets_shortest': 98, 'motif_1_min': 321, 'motif_shortest_min': 160}


In [18]:
# see if any missing ids
missing_cryp_spl = le_id_spliced_novel_cryp - set(novel_le_rep_spliced.subset(lambda df: df.le_id_quant.isin(le_id_spliced_novel_cryp)).le_id_quant)
print(len(missing_cryp_spl))
print(missing_cryp_spl)


3
{'ENSG00000184441.4_1', 'ENSG00000197837.4_1', 'ENSG00000247572.9_4'}


In [19]:
# Select representative PAS for bleedthroughs and spliced events
novel_le_rep_bleed, rep_choices_bleed = select_rep_site(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_bleed))), id_col="le_id_quant")

print(f"Number of bleedthrough intervals before selecting representative LEs - {len(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_bleed))))}")
print(f"Number of bleedthrough intervals after selecting representative LEs - {len(novel_le_rep_bleed)}")
print({dec: len(ids) for dec, ids in rep_choices_bleed.items()})

Number of bleedthrough intervals before selecting representative LEs - 5952
Number of bleedthrough intervals after selecting representative LEs - 1353
{'atlas_1_pred': 590, 'atlas_max_datasets': 152, 'atlas_max_datasets_shortest': 105, 'motif_1_min': 350, 'motif_shortest_min': 114}


In [20]:
# see if any missing ids
missing_cryp_bleed = le_id_bleed_novel_cryp - set(novel_le_rep_bleed.subset(lambda df: df.le_id_quant.isin(le_id_bleed_novel_cryp)).le_id_quant)
len(missing_cryp_bleed)

0

In [21]:
# now want to generate an ID with minimal info required for making maps
# 3'end coordinates/last exon, gene name, le_id, site type, regulation status
# site_type - spliced/bleedthrough
# regulation_status - cryptic/background


# define cryptic le_ids
# define cryptic gene_ids
cryp_le_ids = set(cryptics_df.le_id)
cryp_gene_names = set(cryptics_df.gene_name)


# define cryptic & background le_ids
# background - ns in any dataset (to do this, get list of sig in any dataset)
# get gene IDs with no regulated ALEs in any dataset
# returns pd.Series (index = groupIDs)
ns_gene_ids = (dexseq_df.assign(reg_status=lambda df: np.where(df["padj"].le(0.05), 1, 0))
               .groupby("groupID")
               ["reg_status"]
               .sum()
               .loc[lambda x: x == 0]
             )

# set of le_ids that belong to genes with no significant ALEs (is this too conservative?)
ns_le_ids = set(dexseq_df.loc[dexseq_df["groupID"].isin(ns_gene_ids.index), "le_id"])


print(f"Number of cryptic ALE-containing genes - {len(cryp_gene_names)}")
print(f"Number of cryptic ALE isoforms - {len(cryp_gene_names)}")

print(f"Number of ns ALE-containing genes - {len(set(ns_gene_ids.index))}")
print(f"Number of ns ALE isoforms - {len(set(ns_le_ids))}")


Number of cryptic ALE-containing genes - 283
Number of cryptic ALE isoforms - 283
Number of ns ALE-containing genes - 2937
Number of ns ALE isoforms - 6365


In [22]:
# assign cryptic status for novel bleedthroughs
novel_le_rep_bleed = novel_le_rep_bleed.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids, id_col="le_id_quant"))
novel_le_rep_bleed.cryptic_status.value_counts()


cryptic_status
NULL          884
background    416
cryptic        53
Name: count, dtype: int64

In [23]:
# assign cryptic status for novel spliced events
novel_le_rep_spliced = novel_le_rep_spliced.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids, id_col="le_id_quant"))
novel_le_rep_spliced.as_df().drop_duplicates(subset="le_id_quant").cryptic_status.value_counts()

cryptic_status
NULL          814
background    404
cryptic        75
Name: count, dtype: int64

In [24]:
# Construct 'name' string with minimal annotation information
# le_id|gene_name|site_type|cryptic_status

novel_le_rep_spliced = (novel_le_rep_spliced
 .subset(lambda df: df["cryptic_status"].ne("NULL"))
 .assign("gene_name_ref",
         # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
         # some ref_gene_name entries have multiple gene names
         lambda df: df["gene_name_ref"].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
         )
         .assign("site_type",
                 lambda df: pd.Series(["spliced"]*len(df.index), index=df.index))
 .assign("Name",
                          lambda df: df["le_id_quant"].str.cat(df[["gene_name_ref", "site_type", "cryptic_status"]], sep="|"))
                    
)

novel_le_rep_bleed = (novel_le_rep_bleed
 .subset(lambda df: df["cryptic_status"].ne("NULL"))
 .assign("gene_name_ref",
         # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
         # some ref_gene_name entries have multiple gene names
         lambda df: df["gene_name_ref"].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
         )
         .assign("site_type",
                 lambda df: pd.Series(["bleedthrough"]*len(df.index), index=df.index))
 .assign("Name",
                          lambda df: df["le_id_quant"].str.cat(df[["gene_name_ref", "site_type", "cryptic_status"]], sep="|"))
                    
)

In [25]:
novel_le_rep_spliced.subset(lambda df: df.cryptic_status == "cryptic")[["le_id","Name"]]

Unnamed: 0,Chromosome,Start,End,Strand,le_id,Name
0,chr1,76871267,76871821,+,ENSG00000117069.15_2,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic
1,chr1,61824444,61825501,+,ENSG00000132849.22_1,ENSG00000132849.22_1|PATJ|spliced|cryptic
2,chr1,245464258,245471621,+,ENSG00000162849.16_2,ENSG00000162849.16_2|KIF26B|spliced|cryptic
3,chr1,112540178,112542090,-,ENSG00000007341.19_4,ENSG00000007341.19_4|ST7L|spliced|cryptic
4,chr1,243612605,243613034,-,ENSG00000117020.19_2,ENSG00000117020.19_2|AKT3|spliced|cryptic
...,...,...,...,...,...,...
71,chr21,45792478,45794699,+,ENSG00000183570.17_3,ENSG00000183570.17_3|PCBP3|spliced|cryptic
72,chr22,37464524,37465718,-,ENSG00000100060.18_2,ENSG00000100060.18_2|MFNG|spliced|cryptic
73,chrX,102721363,102724864,+,ENSG00000198908.12_1,ENSG00000198908.12_1|BHLHB9|spliced|cryptic
74,chrX,98679426,98679978,+,ENSG00000281566.3_1,ENSG00000281566.3_1|ENSG00000281566|spliced|cr...


In [26]:
# for spliced events, want to report the last exon coordinates (also split into 5'end & 3'end)
# for this purpose, current coordiantes (from input last exons) is sufficient.
# but for bleedthrough events, this will contain the complete last exon. So need to return to quant last exons to get the unique regions. 

# novel_le_rep_bleed - extract transcript_id (i.e. represents selected isoform) & downstream le_id (le_id_quant)
novel_rep_bleed_tx2le = novel_le_rep_bleed.as_df()[["transcript_id", "le_id_quant", "Name"]].rename(columns={"le_id_quant": "le_id"})
novel_rep_bleed_tx2le



Unnamed: 0,transcript_id,le_id,Name
0,PAPA.CTRL-2.2002.3,ENSG00000082497.12_1,ENSG00000082497.12_1|SERTAD4|bleedthrough|back...
1,PAPA.doxconc_DOX_0075_2.1123.3,ENSG00000116128.12_2,ENSG00000116128.12_2|BCL9|bleedthrough|cryptic
2,PAPA.doxconc_DOX_0075_1.1061.3,ENSG00000116830.12_2,ENSG00000116830.12_2|TTF2|bleedthrough|background
3,PAPA.TDP43-G_S7.362.1,ENSG00000117682.17_3,ENSG00000117682.17_3|DHDDS|bleedthrough|backgr...
4,PAPA.doxconc_NT_0_3.354.1,ENSG00000117682.17_6,ENSG00000117682.17_6|DHDDS|bleedthrough|backgr...
...,...,...,...
464,PAPA.TDP43-G_S7.20350.2,ENSG00000180182.11_3,ENSG00000180182.11_3|MED14|bleedthrough|cryptic
465,PAPA.TDP-2.25089.1,ENSG00000182518.14_1,ENSG00000182518.14_1|FAM104B|bleedthrough|back...
466,PAPA.doxconc_NT_0_3.19568.4,ENSG00000196459.15_1,ENSG00000196459.15_1|TRAPPC2|bleedthrough|back...
467,PAPA.Cont-D_S4.20482.2,ENSG00000197021.9_2,ENSG00000197021.9_2|EOLA2|bleedthrough|background


In [27]:
for k,v in rep_choices_bleed.items():
    if "ENSG00000082497.12_1" in v:
        print(k)

atlas_max_datasets


In [28]:
# subset to representative bleedthrough txs, joining in Name information
# 5'coord = 1st cooridnate of intron (i.e. where spliced occurs)
# 3'coord = final coordinate of last exon
novel_le_rep_bleed_quant = quant_uniq_le.apply(lambda df: df.merge(novel_rep_bleed_tx2le, on=["transcript_id", "le_id"], how="inner"))

novel_le_rep_bleed_quant[["Score", "Name", "transcript_id", "le_id"]].drop_duplicate_positions()



Unnamed: 0,Chromosome,Start,End,Score,Strand,Name,transcript_id,le_id
0,chr1,210241557,210245767,.,+,ENSG00000082497.12_1|SERTAD4|bleedthrough|back...,PAPA.CTRL-2.2002.3,ENSG00000082497.12_1
1,chr1,210245927,210246813,.,+,ENSG00000082497.12_1|SERTAD4|bleedthrough|back...,PAPA.CTRL-2.2002.3,ENSG00000082497.12_1
2,chr1,147622531,147623356,.,+,ENSG00000116128.12_2|BCL9|bleedthrough|cryptic,PAPA.doxconc_DOX_0075_2.1123.3,ENSG00000116128.12_2
3,chr1,117081947,117082852,.,+,ENSG00000116830.12_2|TTF2|bleedthrough|background,PAPA.doxconc_DOX_0075_1.1061.3,ENSG00000116830.12_2
4,chr1,26438284,26439856,.,+,ENSG00000117682.17_3|DHDDS|bleedthrough|backgr...,PAPA.TDP43-G_S7.362.1,ENSG00000117682.17_3
...,...,...,...,...,...,...,...,...
537,chrX,40653641,40654363,.,-,ENSG00000180182.11_3|MED14|bleedthrough|cryptic,PAPA.TDP43-G_S7.20350.2,ENSG00000180182.11_3
538,chrX,55142867,55146180,.,-,ENSG00000182518.14_1|FAM104B|bleedthrough|back...,PAPA.TDP-2.25089.1,ENSG00000182518.14_1
539,chrX,149931286,149932588,.,-,ENSG00000197021.9_2|EOLA2|bleedthrough|background,PAPA.Cont-D_S4.20482.2,ENSG00000197021.9_2
540,chrX,131823777,131825221,.,-,ENSG00000213468.7_1|FIRRE|bleedthrough|cryptic,PAPA.TDP43_ctrl_2.26896.10,ENSG00000213468.7_1


### Repeat for annotated

In [29]:
# get le_ids in which all transcripts are annotated
annotated_le_ids = dexseq_df.groupby("le_id").filter(lambda df: (df.annot_status == "annotated").all()).le_id
le_id_bleed_annot = set(le_id_bleed).intersection(set(annotated_le_ids))
le_id_spliced_annot = set(le_id_spliced).intersection(set(annotated_le_ids))
print(f"Number of annotated bleedthrough last exons (assessed by DEXSeq) - {len(le_id_bleed_annot)}")
print(f"Number of annotated spliced last exons (assessed by DEXSeq) - {len(le_id_spliced_annot)}")

Number of annotated bleedthrough last exons (assessed by DEXSeq) - 2288
Number of annotated spliced last exons (assessed by DEXSeq) - 3561


In [30]:
# subset quantification GTF for spliecd last exons 
ref_le_spliced = quant_uniq_le.subset(lambda df: df.le_id.isin(le_id_spliced_annot))
# select representative isoform for each last exon ID (most distal 3'end)
ref_le_spliced = select_rep_prox_site(ref_le_spliced, "le_id")

# subset quantification GTF for bleedthrough last exons 
ref_le_bleed = quant_uniq_le.subset(lambda df: df.le_id.isin(le_id_bleed_annot))
# select representative isoform for each last exon ID (most distal 3'end)
ref_le_bleed = select_rep_prox_site(ref_le_bleed, "le_id")

In [31]:
# assign cryptic status for annotatesd spliced last exons
ref_le_spliced = ref_le_spliced.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids, id_col="le_id"))
ref_le_spliced.cryptic_status.value_counts()


cryptic_status
NULL          2275
background    2176
cryptic         39
Name: count, dtype: int64

In [32]:
# assign cryptic status for annotatesd bleedthrough last exons
ref_le_bleed = ref_le_bleed.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids, id_col="le_id"))
ref_le_bleed.cryptic_status.value_counts()

cryptic_status
background    1382
NULL          1261
cryptic         24
Name: count, dtype: int64

In [33]:
ref_le_spliced[["ref_gene_name"]]

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name
0,chr1,109214546,109215153,+,SARS1
1,chr1,69873466,69875004,+,LRRC7
2,chr1,6634019,6635586,+,THAP3
3,chr1,7779625,7781432,+,VAMP3
4,chr1,243305011,243306548,+,SDCCAG8
...,...,...,...,...,...
4485,chrY,12859278,12860839,+,USP9Y
4486,chrY,18932316,18932841,-,TTTY14
4487,chrY,18872500,18872834,-,TTTY14
4488,chrY,18872500,18872834,-,TTTY14


In [34]:
x = {'ENSG00000004866.22_2',
 'ENSG00000051825.15_5',
 'ENSG00000103248.19_2',
 'ENSG00000162390.18_4',
 'ENSG00000173818.17_2',
 'ENSG00000174165.8_3',
 'ENSG00000181722.17_5',
 'ENSG00000196275.16_6',
 'ENSG00000197497.11_3',
 'ENSG00000227110.7_3',
 'ENSG00000255545.8_2'}

ref_le_spliced.subset(lambda df: df.le_id.isin(x)).cryptic_status.value_counts()

cryptic_status
cryptic    11
Name: count, dtype: int64

In [35]:
# assign Name field with minimal info
ref_le_spliced = (ref_le_spliced
  .subset(lambda df: df["cryptic_status"].ne("NULL"))
 .assign("gene_name_ref",
         # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
         # some ref_gene_name entries have multiple gene names
         lambda df: df["ref_gene_name"].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
         )
         .assign("site_type",
                 lambda df: pd.Series(["spliced"]*len(df.index), index=df.index))
 .assign("Name",
                          lambda df: df["le_id"].str.cat(df[["gene_name_ref", "site_type", "cryptic_status"]], sep="|"))
)

ref_le_bleed = (ref_le_bleed
 .subset(lambda df: df["cryptic_status"].ne("NULL"))
 .assign("gene_name_ref",
         # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
         # some ref_gene_name entries have multiple gene names
         lambda df: df["ref_gene_name"].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
         )
.assign("site_type",
                 lambda df: pd.Series(["bleedthrough"]*len(df.index), index=df.index))
 .assign("Name",
                          lambda df: df["le_id"].str.cat(df[["gene_name_ref", "site_type", "cryptic_status"]], sep="|"))
)

In [36]:
ref_le_spliced.subset(lambda df: df.le_id.isin(x)).cryptic_status.value_counts()

cryptic_status
cryptic    11
Name: count, dtype: int64

In [37]:
# since last_exon_spliced annotations includes most distal last exon of a gene, let's compare these ALEs to non-terminal ALEs (i.e. there is a competing splicign event)
print(ref_le_spliced.event_type.value_counts())

# ref_le_spliced = ref_le_spliced.subset(lambda df: df["event_type"].ne("last_exon_spliced"))
# ref_le_spliced
      

event_type
last_exon_spliced                           1190
first_exon_spliced                           525
internal_exon_spliced                        247
internal_exon_extension                      106
first_exon_spliced,internal_exon_spliced     106
first_exon_extension                          41
Name: count, dtype: int64


In [38]:
ref_le_spliced.subset(lambda df: df.le_id.isin(x))

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id,cryptic_status,gene_name_ref,site_type,Name
0,chr1,.,exon,54634687,54639192,.,+,.,ENSG00000162390.18,ACOT11,...,,last_exon_spliced,ENSG00000162390.18,ACOT11,4.0,ENSG00000162390.18_4,cryptic,ACOT11,spliced,ENSG00000162390.18_4|ACOT11|spliced|cryptic
1,chr3,.,exon,114314500,114339426,.,-,.,ENSG00000181722.17,ZBTB20,...,"NULL,NULL,NULL,NULL,NULL,NULL,NULL",last_exon_spliced,ENSG00000181722.17,ZBTB20,5.0,ENSG00000181722.17_5,cryptic,ZBTB20,spliced,ENSG00000181722.17_5|ZBTB20|spliced|cryptic
2,chr3,.,exon,8193136,8195073,.,-,.,ENSG00000227110.7,LMCD1-AS1,...,"NULL,NULL",last_exon_spliced,ENSG00000227110.7,LMCD1-AS1,3.0,ENSG00000227110.7_3,cryptic,LMCD1-AS1,spliced,ENSG00000227110.7_3|LMCD1-AS1|spliced|cryptic
3,chr7,.,exon,117222860,117223907,.,+,.,ENSG00000004866.22,ST7,...,"NULL,NULL,NULL,NULL,NULL,NULL,NULL",last_exon_spliced,ENSG00000004866.22,ST7,2.0,ENSG00000004866.22_2,cryptic,ST7,spliced,ENSG00000004866.22_2|ST7|spliced|cryptic
4,chr7,.,exon,74796150,74798265,.,-,.,ENSG00000196275.16,GTF2IRD2,...,,last_exon_spliced,ENSG00000196275.16,GTF2IRD2,6.0,ENSG00000196275.16_6,cryptic,GTF2IRD2,spliced,ENSG00000196275.16_6|GTF2IRD2|spliced|cryptic
5,chr11,.,exon,134502943,134505661,.,+,.,ENSG00000255545.8,B3GAT1-DT,...,"NULL,NULL,NULL",last_exon_spliced,ENSG00000255545.8,B3GAT1-DT,2.0,ENSG00000255545.8_2,cryptic,B3GAT1-DT,spliced,ENSG00000255545.8_2|B3GAT1-DT|spliced|cryptic
6,chr11,.,exon,66520636,66521466,.,-,.,ENSG00000174165.8,ZDHHC24,...,,last_exon_spliced,ENSG00000174165.8,ZDHHC24,3.0,ENSG00000174165.8_3,cryptic,ZDHHC24,spliced,ENSG00000174165.8_3|ZDHHC24|spliced|cryptic
7,chr12,.,exon,123156635,123156908,.,-,.,ENSG00000051825.15,MPHOSPH9,...,"NULL,NULL",last_exon_spliced,ENSG00000051825.15,MPHOSPH9,5.0,ENSG00000051825.15_5,cryptic,MPHOSPH9,spliced,ENSG00000051825.15_5|MPHOSPH9|spliced|cryptic
8,chr16,.,exon,86540719,86541198,.,-,.,ENSG00000103248.19,MTHFSD,...,"NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL",last_exon_spliced,ENSG00000103248.19,MTHFSD,2.0,ENSG00000103248.19_2,cryptic,MTHFSD,spliced,ENSG00000103248.19_2|MTHFSD|spliced|cryptic
9,chr17,.,exon,80420049,80420533,.,+,.,ENSG00000173818.17,ENDOV,...,,last_exon_spliced,ENSG00000173818.17,ENDOV,2.0,ENSG00000173818.17_2,cryptic,ENDOV,spliced,ENSG00000173818.17_2|ENDOV|spliced|cryptic


In [39]:
# combined output BEDs
spliced_out_bed = pr.concat([novel_le_rep_spliced[["Score", "Name"]].drop_duplicate_positions(),
                             ref_le_spliced[["Score", "Name"]].drop_duplicate_positions()])

bleed_out_bed = pr.concat([novel_le_rep_bleed_quant[["Score", "Name"]].drop_duplicate_positions(),
                             ref_le_bleed[["Score", "Name"]].drop_duplicate_positions()])

spliced_out_bed.subset(lambda df: df.Name.str.contains("cryptic$"))

# novel_le_rep_bleed_quant[["Score", "Name"]].drop_duplicate_positions()
# novel_le_rep_spliced[["Score", "Name"]].drop_duplicate_positions()


# generate output bed file of proximal and distal PAS
# utr3_out_bed = pr.concat([sel_le_d3utr_prox[["Score", "Name"]],
#                           sel_novel_le_d3utr[["Score", "Name"]]]
#                           )

# print(len(utr3_out_bed))
# # drop duplicate entries
# utr3_out_bed = utr3_out_bed.apply(lambda df: df.drop_duplicates()).sort()

# utr3_out_bed

Unnamed: 0,Chromosome,Start,End,Score,Strand,Name
0,chr1,76871267,76871821,.,+,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic
1,chr1,61824444,61825501,.,+,ENSG00000132849.22_1|PATJ|spliced|cryptic
2,chr1,245464258,245471621,.,+,ENSG00000162849.16_2|KIF26B|spliced|cryptic
3,chr1,54634687,54639192,.,+,ENSG00000162390.18_4|ACOT11|spliced|cryptic
4,chr1,1616614,1619210,.,+,ENSG00000197530.13_1|MIB2|spliced|cryptic
...,...,...,...,...,...,...
103,chr22,37464524,37465718,.,-,ENSG00000100060.18_2|MFNG|spliced|cryptic
104,chrX,102721363,102724864,.,+,ENSG00000198908.12_1|BHLHB9|spliced|cryptic
105,chrX,98679426,98679978,.,+,ENSG00000281566.3_1|ENSG00000281566|spliced|cr...
106,chrX,17835910,17837395,.,-,ENSG00000131831.18_1|RAI2|spliced|cryptic


In [40]:
# which cryptics are missing from each df
missing_cryp_spl_all = le_id_spliced_cryp.difference(set(spliced_out_bed.subset(lambda df: df.Name.str.contains("cryptic$")).Name.str.split("|", expand=True)[0]))
print(f"fraction of cryptic spliced events retained in output - {len(le_id_spliced_cryp.intersection(set(spliced_out_bed.subset(lambda df: df.Name.str.contains('cryptic$')).Name.str.split('|', expand=True)[0]))) / len(le_id_spliced_cryp)}")
print(f"number of missing ids - {len(missing_cryp_spl_all)}")

fraction of cryptic spliced events retained in output - 0.8067226890756303
number of missing ids - 23


In [41]:
# how many missing spliced events are novel?
missing_cryp_spl_all_novel = set(novel_le.subset(lambda df: df.le_id_quant.isin(missing_cryp_spl_all)).le_id_quant)
print(len(missing_cryp_spl_all_novel))
novel_le.subset(lambda df: df.le_id_quant.isin(missing_cryp_spl_all_novel))


3


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,...,gene_id_ref,mean_PPAU_base,mean_PPAU_treatment,delta_PPAU_treatment_control,experiment_id,Cluster,experiment_count,multiple_datasets,max_mean_PPAU,le_id_quant
0,chr5,.,exon,81199922,81203527,.,-,.,PAPA.CTRL-1.6948,PAPA.CTRL-1.6948.2,...,ENSG00000247572.9,0.0614445669103058,0.0495702580721974,-0.0118743088381084,humphrey_i3_cortical,2471,3,1,0.0614445669103058,ENSG00000247572.9_4
1,chr5,.,exon,81199922,81203527,.,-,.,PAPA.CTRL-1.6948,PAPA.CTRL-1.6948.2,...,ENSG00000247572.9,0.0614445669103058,0.0495702580721974,-0.0118743088381084,humphrey_i3_cortical,2471,3,1,0.0614445669103058,ENSG00000247572.9_4
2,chr5,.,exon,81200328,81203527,.,-,.,PAPA.doxconc_DOX_0075_1.5662,PAPA.doxconc_DOX_0075_1.5662.1,...,ENSG00000247572.9,0.141934336874005,0.25808703104677,0.116152694172765,zanovello_shsy5y_curve_0075,2471,3,1,0.25808703104677,ENSG00000247572.9_4
3,chr12,.,exon,14762507,14769009,.,-,.,PAPA.ctrl_ctrl_2.16825,PAPA.ctrl_ctrl_2.16825.4,...,ENSG00000197837.4,0.0877670042341209,0.0,-0.0877670042341209,zanovello_i3_cortical_upf1_tdp_tdpkd_upf1ctl_v...,4998,1,0,0.0877670042341209,ENSG00000197837.4_1
4,chr12,.,exon,14762507,14769009,.,-,.,PAPA.ctrl_ctrl_2.16825,PAPA.ctrl_ctrl_2.16825.4,...,ENSG00000197837.4,0.0877670042341209,0.0,-0.0877670042341209,zanovello_i3_cortical_upf1_tdp_tdpkd_upf1ctl_v...,4998,1,0,0.0877670042341209,ENSG00000197837.4_1
5,chr21,.,exon,44333236,44337744,.,+,.,PAPA.TDP-1.23931,PAPA.TDP-1.23931.2,...,ENSG00000184441.4,1.0,1.0,0.0,humphrey_i3_cortical,7247,4,1,1.0,ENSG00000184441.4_1


In [42]:
# how many missing cryptic spliced events? partners to 3'UTR extensions
le_id_d3utr_spliced_prtnr = set(dexseq_df.loc[dexseq_df.simple_event_type == "spliced", "le_id"][spliced_d3utr_olap].drop_duplicates())
print(len(missing_cryp_spl_all.intersection(le_id_d3utr_spliced_prtnr)))
",".join(dexseq_df[dexseq_df.le_id.isin(missing_cryp_spl_all.intersection(le_id_d3utr_spliced_prtnr))].gene_name.drop_duplicates())

23


'LINC02202,ZKSCAN7,CKMT2-AS1,ENSG00000184441,TCP11L2,ENSG00000216895,NOTCH2NLC,WDR33,ABLIM2,H4-16,ST7L,RIPK1,RSKR,FBXW8,LINC01006,RRP7A,LCMT1-AS1,TAGAP-AS1,HHIPL1,RET,LINC00476,ENSG00000230074,SNHG31'

In [43]:
# above 23 are probs better off not including in spliced ( & rather as cryptic 3'UTR changes, as more likely to be reg at site level rather than SJ)
# of the 3 novel, are they partners to 3'UTR extensions? (NB: how?)
missing_cryp_spl_all_novel.intersection(missing_cryp_spl_all.intersection(le_id_d3utr_spliced_prtnr))

{'ENSG00000184441.4_1', 'ENSG00000197837.4_1', 'ENSG00000247572.9_4'}

In [44]:
# of the 34, are the remaining events all annotated?
missing_cryp_spl_all_n_d3utr_prtnr = missing_cryp_spl_all.difference(missing_cryp_spl_all.intersection(le_id_d3utr_spliced_prtnr))
print(len(missing_cryp_spl_all_n_d3utr_prtnr))
missing_cryp_spl_all_n_d3utr_prtnr.difference(set(ref_le_spliced.le_id))

0


set()

In [45]:
dexseq_df[dexseq_df.le_id.isin(missing_cryp_spl_all_n_d3utr_prtnr)][["le_id", "gene_name", "annot_status", "event_type", "simple_event_type"]].drop_duplicates()

Unnamed: 0,le_id,gene_name,annot_status,event_type,simple_event_type


In [46]:
spliced_out_bed.subset(lambda df: df.Name.str.contains("STMN2", regex=False))

Unnamed: 0,Chromosome,Start,End,Score,Strand,Name
0,chr8,79616821,79617071,.,+,ENSG00000104435.14_1|STMN2|spliced|cryptic


In [47]:
# how many le_ids have distinct coordinates? which collapsing category do they belong to?
Counter([k if le_id.split("|")[0] in v else None for le_id in spliced_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index for k,v in rep_choices_spliced.items()])

Counter({None: 365, 'atlas_1_pred': 10, 'atlas_max_datasets': 5})

In [48]:
spliced_dupe = [le_id.split("|")[0] for  le_id in spliced_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index 
    if le_id.split("|")[0] in set(rep_choices_spliced["atlas_1_pred"]).union(set(rep_choices_spliced["atlas_max_datasets"]))
    ]

spliced_dupe

['ENSG00000283183.3_1',
 'ENSG00000120251.21_2',
 'ENSG00000083223.18_2',
 'ENSG00000258053.1_1',
 'ENSG00000215022.8_1',
 'ENSG00000133460.20_4',
 'ENSG00000251573.2_2',
 'ENSG00000251562.9_1',
 'ENSG00000155034.20_2',
 'ENSG00000230590.12_4',
 'ENSG00000139437.18_1',
 'ENSG00000197969.14_5',
 'ENSG00000131969.15_1',
 'ENSG00000225546.6_3',
 'ENSG00000140481.15_4']

In [49]:
novel_le_rep_spliced.subset(lambda df: df.le_id_quant.isin(spliced_dupe))[["le_id_quant", "event_type", "site_type", "cryptic_status"]]

Unnamed: 0,Chromosome,Start,End,Strand,le_id_quant,event_type,site_type,cryptic_status
0,chr4,157362798,157367162,+,ENSG00000120251.21_2,internal_exon_spliced,spliced,background
1,chr4,157361009,157367162,+,ENSG00000120251.21_2,internal_exon_spliced,spliced,background
2,chr4,578719,582368,+,ENSG00000283183.3_1,internal_exon_extension,spliced,background
3,chr4,582123,582368,+,ENSG00000283183.3_1,last_exon_spliced,spliced,background
4,chr5,50965151,50965903,-,ENSG00000251573.2_2,first_exon_spliced,spliced,background
5,chr5,50965151,50969108,-,ENSG00000251573.2_2,first_exon_spliced,spliced,background
6,chr6,13290012,13294898,-,ENSG00000215022.8_1,internal_exon_extension,spliced,background
7,chr6,13290012,13290585,-,ENSG00000215022.8_1,internal_exon_spliced,spliced,background
8,chr7,5475798,5481931,-,ENSG00000155034.20_2,"internal_exon_spliced,last_exon_spliced",spliced,background
9,chr7,5475796,5481931,-,ENSG00000155034.20_2,"internal_exon_spliced,last_exon_spliced",spliced,background


In [50]:
# all occur where 3' coordinate is the same, but differ in their 5'coordinates
# 1 - keep all possible 5' & 3' coordinates?
# 2 - keep the shortest one (similar to how I have filtered previously)

In [51]:
bleed_out_bed.subset(lambda df: df.Name.str.contains("cryptic$"))

Unnamed: 0,Chromosome,Start,End,Score,Strand,Name
0,chr1,147622531,147623356,.,+,ENSG00000116128.12_2|BCL9|bleedthrough|cryptic
1,chr1,21453372,21457150,.,+,ENSG00000142794.19_3|NBPF3|bleedthrough|cryptic
2,chr1,156139102,156140091,.,+,ENSG00000160789.24_11|LMNA|bleedthrough|cryptic
3,chr1,149834134,149835093,.,+,ENSG00000270882.3_2|H4C14|bleedthrough|cryptic
4,chr1,45013791,45013889,.,+,ENSG00000126088.14_1|UROD|bleedthrough|cryptic
...,...,...,...,...,...,...
73,chr21,42849225,42850040,.,-,ENSG00000160193.12_1|WDR4|bleedthrough|cryptic
74,chrX,91882906,91891321,.,+,ENSG00000102290.23_3|PCDH11X|bleedthrough|cryptic
75,chrX,107087378,107088435,.,-,ENSG00000089682.17_2|RBM41|bleedthrough|cryptic
76,chrX,40653641,40654363,.,-,ENSG00000180182.11_3|MED14|bleedthrough|cryptic


In [52]:
# how many le_ids have distinct coordinates? which collapsing category do they belong to?
Counter([k if le_id.split("|")[0] in v else None for le_id in bleed_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index for k,v in rep_choices_bleed.items()])

Counter({None: 502,
         'atlas_1_pred': 24,
         'motif_shortest_min': 18,
         'atlas_max_datasets': 17,
         'motif_1_min': 7,
         'atlas_max_datasets_shortest': 2})

In [53]:
bleed_dupe = [le_id.split("|")[0] for  le_id in bleed_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index 
    if le_id.split("|")[0] in set(rep_choices_bleed["atlas_1_pred"]).union(set(rep_choices_bleed["atlas_max_datasets"]).union(set(rep_choices_bleed["motif_1_min"])))
    ]

bleed_dupe

['ENSG00000155975.10_3',
 'ENSG00000178338.11_1',
 'ENSG00000251562.9_1',
 'ENSG00000131969.15_1',
 'ENSG00000133460.20_4',
 'ENSG00000139437.18_1',
 'ENSG00000242220.9_2',
 'ENSG00000250337.8_1',
 'ENSG00000120251.21_2',
 'ENSG00000100554.12_3',
 'ENSG00000147421.18_2',
 'ENSG00000139971.15_3',
 'ENSG00000106052.14_4',
 'ENSG00000002746.15_3',
 'ENSG00000174485.16_4',
 'ENSG00000246695.9_2',
 'ENSG00000258053.1_1',
 'ENSG00000163075.13_2',
 'ENSG00000175764.17_3',
 'ENSG00000283183.3_1',
 'ENSG00000258984.5,ENSG00000184182.19_2',
 'ENSG00000070182.21_1',
 'ENSG00000236255.2_1',
 'ENSG00000083223.18_2',
 'ENSG00000236753.7_1',
 'ENSG00000197928.11_1',
 'ENSG00000189144.15_2',
 'ENSG00000178381.12_2',
 'ENSG00000104957.14_2',
 'ENSG00000126790.12_2',
 'ENSG00000114331.16_4',
 'ENSG00000151276.24_3',
 'ENSG00000140481.15_4',
 'ENSG00000120158.12_3',
 'ENSG00000133114.18_2',
 'ENSG00000100926.15,ENSG00000254692.1_1',
 'ENSG00000082497.12_1',
 'ENSG00000120833.14_1',
 'ENSG00000225138.8_2'

In [54]:
novel_le_rep_bleed.subset(lambda df: df.le_id_quant.isin(bleed_dupe))[["le_id_quant", "event_type", "site_type", "cryptic_status"]]

Unnamed: 0,Chromosome,Start,End,Strand,le_id_quant,event_type,site_type,cryptic_status
0,chr1,210241557,210246813,+,ENSG00000082497.12_1,"internal_exon_spliced,last_exon_spliced",bleedthrough,background
1,chr2,119601217,119603742,+,ENSG00000163075.13_2,internal_exon_spliced,bleedthrough,background
2,chr2,117837657,117843209,+,ENSG00000236255.2_1,last_exon_spliced,bleedthrough,background
3,chr2,238041287,238046154,+,"ENSG00000258984.5,ENSG00000184182.19_2","first_exon_spliced,internal_exon_spliced,last_...",bleedthrough,background
4,chr3,195279689,195285857,-,ENSG00000114331.16_4,"internal_exon_spliced,last_exon_spliced",bleedthrough,background
...,...,...,...,...,...,...,...,...
58,chr19,53238587,53242440,-,ENSG00000197928.11_1,last_exon_spliced,bleedthrough,background
59,chr21,32573398,32576923,-,ENSG00000242220.9_2,last_exon_extension,bleedthrough,background
60,chr22,23877384,23878245,+,ENSG00000133460.20_4,internal_exon_spliced,bleedthrough,background
61,chr22,23877041,23878245,+,ENSG00000133460.20_4,"first_exon_spliced,internal_exon_spliced,last_...",bleedthrough,background


In [55]:
# select representative 5'end coord (the shoretest) for a given le_id
bleed_out_bed = select_rep_five_end(bleed_out_bed, id_col = "Name")
spliced_out_bed = select_rep_five_end(spliced_out_bed, id_col = "Name")

In [56]:
# how many le_ids have distinct coordinates? which collapsing category do they belong to?
Counter([k if le_id.split("|")[0] in v else None for le_id in bleed_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index for k,v in rep_choices_bleed.items()])

Counter({None: 4, 'atlas_max_datasets': 1})

In [57]:
# how many le_ids have distinct coordinates? which collapsing category do they belong to?
Counter([k if le_id.split("|")[0] in v else None for le_id in spliced_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index for k,v in rep_choices_spliced.items()])

Counter({None: 8, 'atlas_1_pred': 2})

In [62]:
[le_id.split("|")[0] for le_id in spliced_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index if le_id.split("|")[0] in rep_choices_spliced["atlas_1_pred"]]

[]

In [58]:
spliced_out_bed.Name.str.split("|", expand=True)[3].value_counts()

3
background    2117
cryptic        107
Name: count, dtype: int64

In [59]:
bleed_out_bed.Name.str.split("|", expand=True)[[3]].value_counts()

3         
background    1584
cryptic         74
Name: count, dtype: int64

In [60]:
if not os.path.exists("../processed/iclip_regions"):
    os.makedirs("../processed/iclip_regions")

#### TEMPORARY - work out why duplicate regions/multiple PAS coords for some last exons. To prevent duplciats slect longest per interval
spliced_out_bed = select_rep_prox_site(spliced_out_bed, "Name")
bleed_out_bed = select_rep_prox_site(bleed_out_bed, "Name")

spliced_out_bed.to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_spliced.last_exons.bed")
# also make one with just polYA sites
spliced_out_bed.three_end().to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_spliced.pas.bed")
# also first coordinate of exon (i.e. interval ending at this coordinate = splice site end, or substract 1 (strand-aware) to get final nucleotide of splice site)
spliced_out_bed.five_end().to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_spliced.le_start.bed")

bleed_out_bed.to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_bleedthrough.last_exons.bed")
# also make one with just polYA sites
bleed_out_bed.three_end().to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_bleedthrough.pas.bed")
# also first coordinate of exon (i.e. 1st nucleotide of intron
bleed_out_bed.five_end().to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_bleedthrough.le_start.bed")