In [2]:
import pyranges as pr
import pandas as pd
import numpy as np

In [4]:
# Inputs (all paths are relative to this notebook):
#   - combined novel + reference last-exon annotation, read as a PyRanges object
#   - the accompanying tx2le table (tab-separated)
#   - the processed/cleaned DEXSeq APA results table (tab-separated)
le_full = pr.read_gtf(
    "../data/PAPA/novel_ref_combined.last_exons.gtf"
)
tx2le = pd.read_csv(
    "../data/PAPA/novel_ref_combined.tx2le.tsv",
    sep="\t",
)
dexseq = pd.read_csv(
    "../data/PAPA/2023-05-24_i3_cortical_zanovello.all_datasets.dexseq_apa.results.processed.cleaned.tsv",
    sep="\t",
)


In [5]:
# Cryptic events = rows of the results table passing all three filters:
#   padj < 0.05, mean_PPAU_base < 0.1 and delta_PPAU_treatment_control > 0.1
cryptic = dexseq.loc[
    dexseq["padj"].lt(0.05)
    & dexseq["mean_PPAU_base"].lt(0.1)
    & dexseq["delta_PPAU_treatment_control"].gt(0.1)
]
# Set of last-exon IDs for fast membership checks in later cells
cryptic_le_ids = set(cryptic["le_id"])

# Number of distinct cryptic last exons
cryptic["le_id"].nunique()

293

In [27]:
# Restrict the full last-exon annotation to intervals whose le_id is in the cryptic set
le_full_cryptic = le_full.subset(
    lambda sub_df: sub_df["le_id"].isin(cryptic_le_ids)
)
le_full_cryptic

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,15939295,15940477,.,+,.,PAPA.chx_tdp_DOX_ctrl_2.224,,...,,,,,,last_exon_extension,ENSG00000065526.12,SPEN,4.0,ENSG00000065526.12_4
1,chr1,.,exon,40421975,40423342,.,+,.,PAPA.TDP-6.631,,...,,,,,,last_exon_extension,ENSG00000084070.12,SMAP2,2.0,ENSG00000084070.12_2
2,chr1,.,exon,147622270,147623356,.,+,.,PAPA.doxconc_DOX_0075_2.1123,,...,,,,,,internal_exon_extension,ENSG00000116128.12,BCL9,2.0,ENSG00000116128.12_2
3,chr1,.,exon,147622270,147622693,.,+,.,PAPA.chx_tdp_DOX_ctrl_2.1186,,...,,,,,,internal_exon_extension,ENSG00000116128.12,BCL9,2.0,ENSG00000116128.12_2
4,chr1,.,exon,147622270,147623701,.,+,.,PAPA.doxconc_DOX_0075_1.1139,,...,,,,,,internal_exon_extension,ENSG00000116128.12,BCL9,2.0,ENSG00000116128.12_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118,chrX,.,exon,40653676,40654556,.,-,.,PAPA.TDP43_ctrl_3.26410,,...,,,,,,last_exon_extension,ENSG00000180182.11,MED14,3.0,ENSG00000180182.11_3
1119,chrX,.,exon,131823775,131825365,.,-,.,PAPA.TDP43-F_S6.20505,,...,,,,,,internal_exon_extension,ENSG00000213468.7,FIRRE,1.0,ENSG00000213468.7_1
1120,chrX,.,exon,131823777,131825365,.,-,.,PAPA.TDP43_ctrl_2.26896,,...,,,,,,internal_exon_extension,ENSG00000213468.7,FIRRE,1.0,ENSG00000213468.7_1
1121,chrX,.,exon,108267641,108269152,.,-,.,ENSG00000197565.17,COL4A6,...,last,108221374108221374108221374,108310747108310747108310747,"ENST00000372216.8,ENST00000538570.5,ENST000006...","NULL,NULL,NULL",internal_exon_spliced,ENSG00000197565.17,COL4A6,1.0,ENSG00000197565.17_1


In [28]:
# Add a simple event type annotation to each last exon.
# Annotated events are also assigned event types,
# and bleedthrough events are labelled as such.

def collapse_vals(s: pd.Series) -> str:
    """Collapse a Series of strings into one comma-separated string.

    Missing values are dropped, duplicates are removed and the remaining
    values are sorted before being joined with ",".

    Parameters
    ----------
    s : pd.Series
        String values to collapse (e.g. the values of one column within a group).

    Returns
    -------
    str
        Sorted unique non-missing values joined by ","; an empty string if
        there are no non-missing values.
    """
    # Drop None/NaN first: str.join raises TypeError on non-string items,
    # which would otherwise abort the whole groupby aggregation.
    unique_sorted = s.dropna().drop_duplicates().sort_values()
    return ",".join(unique_sorted)

# Lookup table: one row per le_id with its annot_status and simple_event_type.
# When an le_id carries several distinct values they are collapsed into a
# single comma-separated string by collapse_vals.
le2event = (
    cryptic
    .groupby("le_id")[["annot_status", "simple_event_type"]]
    .agg(collapse_vals)
    .reset_index()
)
print(le2event[["annot_status", "simple_event_type"]].value_counts())


# Left-join the lookup onto the cryptic last-exon intervals via le_id, then
# expose the collapsed simple_event_type under the name event_type_simple.
# NOTE: this rebinds le_full_cryptic, so re-running the cell merges a second time.
le_full_cryptic = le_full_cryptic.apply(
    lambda sub_df: (
        sub_df
        .merge(le2event, on="le_id", how="left", suffixes=[None, "_simple"])
        .rename(columns={"simple_event_type": "event_type_simple"})
    )
)
le_full_cryptic

annot_status     simple_event_type                 
novel            distal_3utr_extension                 104
                 spliced                                58
annotated        spliced                                48
novel            bleedthrough                           37
annotated        bleedthrough                           18
annotated,novel  spliced                                13
                 bleedthrough,spliced                    6
annotated        bleedthrough,spliced                    4
novel            bleedthrough,distal_3utr_extension      3
                 bleedthrough,spliced                    2
Name: count, dtype: int64


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id,annot_status,event_type_simple
0,chr1,.,exon,15939295,15940477,.,+,.,PAPA.chx_tdp_DOX_ctrl_2.224,,...,,,,last_exon_extension,ENSG00000065526.12,SPEN,4.0,ENSG00000065526.12_4,novel,distal_3utr_extension
1,chr1,.,exon,40421975,40423342,.,+,.,PAPA.TDP-6.631,,...,,,,last_exon_extension,ENSG00000084070.12,SMAP2,2.0,ENSG00000084070.12_2,novel,distal_3utr_extension
2,chr1,.,exon,147622270,147623356,.,+,.,PAPA.doxconc_DOX_0075_2.1123,,...,,,,internal_exon_extension,ENSG00000116128.12,BCL9,2.0,ENSG00000116128.12_2,novel,bleedthrough
3,chr1,.,exon,147622270,147622693,.,+,.,PAPA.chx_tdp_DOX_ctrl_2.1186,,...,,,,internal_exon_extension,ENSG00000116128.12,BCL9,2.0,ENSG00000116128.12_2,novel,bleedthrough
4,chr1,.,exon,147622270,147623701,.,+,.,PAPA.doxconc_DOX_0075_1.1139,,...,,,,internal_exon_extension,ENSG00000116128.12,BCL9,2.0,ENSG00000116128.12_2,novel,bleedthrough
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118,chrX,.,exon,40653676,40654556,.,-,.,PAPA.TDP43_ctrl_3.26410,,...,,,,last_exon_extension,ENSG00000180182.11,MED14,3.0,ENSG00000180182.11_3,novel,bleedthrough
1119,chrX,.,exon,131823775,131825365,.,-,.,PAPA.TDP43-F_S6.20505,,...,,,,internal_exon_extension,ENSG00000213468.7,FIRRE,1.0,ENSG00000213468.7_1,novel,bleedthrough
1120,chrX,.,exon,131823777,131825365,.,-,.,PAPA.TDP43_ctrl_2.26896,,...,,,,internal_exon_extension,ENSG00000213468.7,FIRRE,1.0,ENSG00000213468.7_1,novel,bleedthrough
1121,chrX,.,exon,108267641,108269152,.,-,.,ENSG00000197565.17,COL4A6,...,108310747108310747108310747,"ENST00000372216.8,ENST00000538570.5,ENST000006...","NULL,NULL,NULL",internal_exon_spliced,ENSG00000197565.17,COL4A6,1.0,ENSG00000197565.17_1,annotated,spliced


In [29]:
# Write the annotated cryptic last exons to a GTF file under ../processed/
le_full_cryptic.to_gtf(
    path="../processed/2023-06-20_cryptic_last_exons.gtf"
)