In [31]:
import pyranges as pr
import pandas as pd
import numpy as np
from define_helpers import select_rep_site, _define_cryptic_status, select_rep_prox_site, select_rep_five_end
from collections import Counter
import os

In [32]:
# input novel last exons used to generate combined reference of last exons (full last exon sequence, not just subtracted region)
novel_le = pr.read_gtf("../data/papa/2023-03-29_papa_i3_cortical_upf1_zanovello_overlap_annotated.gtf")
# dexseq results df (used to extract cryptic events of each class)
dexseq_df = pd.read_csv("../data/papa/2023-05-24_i3_cortical_zanovello.all_datasets.dexseq_apa.results.processed.cleaned.tsv", sep="\t")


# reference GTF used to identify novel last exons, quantify vs ref
ref_gtf = pr.read_gtf("../data/reference_filtered.gtf")
# 
tx2le = pd.read_csv("../data/papa/novel_ref_combined.tx2le.tsv", sep="\t")

# last exon quantification regions used as input to Salmon
quant_uniq_le = pr.read_gtf("../data/papa/novel_ref_combined.quant.last_exons.gtf")


info_df = pd.read_csv("../data/papa/novel_ref_combined.info.tsv", sep="\t")

In [33]:
# summary df of cryptic events following manual validation of bleedthrough events
cryptics_df = pd.read_csv("../../preprocessing/processed/cryptics_summary_all_events_bleedthrough_manual_validation.tsv", sep="\t")

# load in IDs to remove entirely from analysis (remove as cryptic, but make sure nto included in overall background)
with open("../../preprocessing/processed/cryptics_manual_validation_fail_le_ids.txt", "r") as infile:
    manual_validation_fail_ids = [line.rstrip("\n") for line in infile]

# get a dict of set of le_ids in each case
event_type_le_ids = cryptics_df.groupby("simple_event_type")["le_id"].agg(set).to_dict()

for event_type, ids in event_type_le_ids.items():
    print(f"Event type - {event_type} - number of IDs - {len(ids)}")

print(f"Number of cryptic IDs failing manual validation - {len(manual_validation_fail_ids)}")

Event type - bleedthrough - number of IDs - 21
Event type - bleedthrough,distal_3utr_extension - number of IDs - 3
Event type - bleedthrough,spliced - number of IDs - 12
Event type - distal_3utr_extension - number of IDs - 104
Event type - spliced - number of IDs - 119
Number of cryptic IDs failing manual validation - 34


In [34]:
# filter out failed IDs from dexseq df (so not included as background)
dexseq_df = dexseq_df[~dexseq_df["le_id"].isin(manual_validation_fail_ids)]
dexseq_df

Unnamed: 0,experiment_name,binID,groupID,featureID,exonBaseMean,dispersion,stat,pvalue,padj,UsageCoefficient_base,...,annot_status,transcript_id,chromosome,strand,start,end,mean_PPAU_base,mean_PPAU_treatment,delta_PPAU_treatment_control,simple_event_type
0,brown_i3_cortical,ENSG00000021645.20:E001,ENSG00000021645.20,E001,32.627088,0.044220,90.962787,1.463997e-21,2.250563e-19,1.452069,...,novel,PAPA.TDP-4.18197.2,chr14,+,7.829867e+07,7.830620e+07,0.001914,0.033848,0.031934,spliced
1,brown_i3_cortical,ENSG00000021645.20:E002,ENSG00000021645.20,E002,622.521901,0.022117,135.739619,2.274890e-31,5.061630e-29,12.675156,...,annotated,"ENST00000557594.5,ENST00000428277.6,ENST000002...",chr14,+,7.986104e+39,7.986248e+39,0.998086,0.934776,-0.063310,spliced
2,brown_i3_cortical,ENSG00000021645.20:E002,ENSG00000021645.20,E002,622.521901,0.022117,135.739619,2.274890e-31,5.061630e-29,12.675156,...,annotated,"ENST00000557594.5,ENST00000428277.6,ENST000002...",chr14,+,7.986104e+39,7.986248e+39,0.998086,0.934776,-0.063310,spliced
3,brown_i3_cortical,ENSG00000048649.14:E001,ENSG00000048649.14,E001,163.262346,0.037855,88.319669,5.568698e-21,8.407740e-19,4.515264,...,novel,"PAPA.doxconc_DOX_0075_2.11065.7,PAPA.TDP43-G_S...",chr11,-,7.781315e+71,7.781379e+71,0.090158,0.462779,0.372622,spliced
4,brown_i3_cortical,ENSG00000048649.14:E001,ENSG00000048649.14,E001,163.262346,0.037855,88.319669,5.568698e-21,8.407740e-19,4.515264,...,annotated,"PAPA.doxconc_DOX_0075_2.11065.7,PAPA.TDP43-G_S...",chr11,-,7.781315e+71,7.781379e+71,0.090158,0.462779,0.372622,spliced
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209479,zanovello_skndz_curve_1,ENSG00000280789.2:E002,ENSG00000280789.2,E002,99.077638,12.000000,0.006587,9.353145e-01,9.785953e-01,5.982423,...,novel,PAPA.chx_tdp_CTRL_ctrl_4.15153.2,chr16,+,2.981955e+07,2.982252e+07,0.181220,0.333333,0.152113,distal_3utr_extension
209480,zanovello_skndz_curve_1,ENSG00000280832.2:E001,ENSG00000280832.2,E001,75.778902,0.099478,0.075875,7.829679e-01,9.189656e-01,6.014731,...,annotated,ENST00000626810.1,chr11,-,1.263502e+08,1.263518e+08,0.133144,0.147606,0.014462,spliced
209481,zanovello_skndz_curve_1,ENSG00000280832.2:E002,ENSG00000280832.2,E002,402.882205,0.085732,0.087526,7.673463e-01,9.134343e-01,12.145436,...,annotated,ENST00000629441.2,chr11,-,1.263409e+08,1.263423e+08,0.866856,0.852394,-0.014462,spliced
209482,zanovello_skndz_curve_1,ENSG00000282508.2:E001,ENSG00000282508.2,E001,43.093715,0.060733,0.017535,8.946533e-01,9.665471e-01,4.668928,...,annotated,ENST00000633109.1,chr19,-,2.112510e+05,2.117150e+05,0.678888,0.582547,-0.096341,spliced


In [35]:
# get le_ids that are novel bleedthroughs or ALEs
# le_id_spliced = set(dexseq_df[dexseq_df.simple_event_type == "distal_3utr_extension"].le_id)
# le_id_bleed = set(dexseq_df[dexseq_df.simple_event_type == "distal_3utr_extension"].le_id)
dexseq_df.simple_event_type.value_counts()


simple_event_type
spliced                  130806
bleedthrough              41054
distal_3utr_extension     37225
Name: count, dtype: int64

In [36]:
# extract lists of event types
le_id_spliced = dexseq_df.loc[dexseq_df.simple_event_type == "spliced", "le_id"]
le_id_bleed = dexseq_df.loc[dexseq_df.simple_event_type == "bleedthrough", "le_id"]
le_id_d3utr = dexseq_df.loc[dexseq_df.simple_event_type == "distal_3utr_extension", "le_id"]

print(f"Number of spliced ALE events - {len(set(le_id_spliced))}")
print(f"Number of bleedthrough ALE events - {len(set(le_id_bleed))}")
print(f"Number of 3'UTR-ALE events - {len(set(le_id_d3utr))}")

Number of spliced ALE events - 7978
Number of bleedthrough ALE events - 3565
Number of 3'UTR-ALE events - 3369


In [37]:
# how many le_ids have multiple annotations?
le_id_spliced_bleed = set(le_id_spliced).intersection(set(le_id_bleed))
print(f"Number of le_ids with bleedthrough and spliced annotations - {len(le_id_spliced_bleed)}")

Number of le_ids with bleedthrough and spliced annotations - 986


In [38]:
# remvoe le_ids with multiple annotations from list (to avoid ambiguity about annotations)
le_id_spliced = le_id_spliced[~le_id_spliced.isin(le_id_spliced_bleed)]
le_id_bleed = le_id_bleed[~le_id_bleed.isin(le_id_spliced_bleed)]
print(f"Number of spliced ALE events (after removing category-overlapping events) - {le_id_spliced.nunique()}")
print(f"Number of bleedthrough ALE events (after removing category-overlapping events) - {le_id_bleed.nunique()}")


Number of spliced ALE events (after removing category-overlapping events) - 6992
Number of bleedthrough ALE events (after removing category-overlapping events) - 2579


In [60]:
cryptics_df.simple_event_type.value_counts()

simple_event_type
spliced                               119
distal_3utr_extension                 104
bleedthrough                           21
bleedthrough,spliced                   12
bleedthrough,distal_3utr_extension      3
Name: count, dtype: int64

In [62]:
cryptics_df.gene_name.nunique()

250

In [40]:
cryptics_df.annot_status.value_counts()

annot_status
novel              185
annotated           55
annotated,novel     19
Name: count, dtype: int64

In [41]:
cryptics_df[["annot_status","simple_event_type"]].value_counts(sort=False)

annot_status     simple_event_type                 
annotated        bleedthrough                            3
                 bleedthrough,spliced                    4
                 spliced                                48
annotated,novel  bleedthrough,spliced                    6
                 spliced                                13
novel            bleedthrough                           18
                 bleedthrough,distal_3utr_extension      3
                 bleedthrough,spliced                    2
                 distal_3utr_extension                 104
                 spliced                                58
Name: count, dtype: int64

In [42]:
# pull out cryptic IDs from summary df
le_id_spliced_cryp = set(cryptics_df.loc[cryptics_df["simple_event_type"] == "spliced", "le_id"])
le_id_bleed_cryp = set(cryptics_df.loc[cryptics_df["simple_event_type"] == "bleedthrough", "le_id"])


print(f"Number of cryptic spliced last exons - {len(le_id_spliced_cryp)}")
print(f"Number of cryptic bleedthrough last exons - {len(le_id_bleed_cryp)}")

Number of cryptic spliced last exons - 119
Number of cryptic bleedthrough last exons - 21


In [43]:
# get a set of novel cryptic IDs so can track where they go
le_id_spliced_novel_cryp = set(cryptics_df[(cryptics_df["annot_status"].isin(["annotated,novel", "novel"])) & (cryptics_df["simple_event_type"] == "spliced")].le_id)
le_id_bleed_novel_cryp = set(cryptics_df[(cryptics_df["annot_status"].isin(["annotated,novel", "novel"])) & (cryptics_df["simple_event_type"] == "bleedthrough")].le_id)

print(f"Number of novel cryptic spliced last exons - {len(le_id_spliced_novel_cryp)}")
print(f"Number of novel cryptic bleedthrough last exons - {len(le_id_bleed_novel_cryp)}")


Number of novel cryptic spliced last exons - 71
Number of novel cryptic bleedthrough last exons - 18


In [44]:
# good idea to ensure don't consider spliced LEs that have a 3'UTR extension
# UPDATE now need to select representative proximal site for each distal site, since quantification approach lumps together all annotated proximal sites (and compares them to extension) 
# since le_ids are annotated sequentially, know that immediately succeeding le_number is the partner le_id
le_id_spliced_spl = le_id_spliced.str.split("_", regex=False, expand=True)
le_id_spliced_spl[1] = le_id_spliced_spl[1].astype(int).add(1)
# reconstruct le_id (corresponding to theoretical distal 3'UTR extension le_id)
le_id_spliced_d3utr = le_id_spliced_spl[0].str.cat(le_id_spliced_spl[1].astype(str), sep="_")

# now have theoretical distal 3'UTR le_ids, intersect with actual ids - any overlapping will be removed from downstream analysis
spliced_d3utr_olap = le_id_spliced_d3utr.isin(le_id_d3utr.values)
# print(f"Number of spliced last exons with 3'UTR extension - {spliced_d3utr_olap.sum()}")

print(f"Number of spliced ALE events (before removing those with a novel extended 3'UTR) - {len(set(le_id_spliced))}")
# now use mask to remove these ALEs
le_id_spliced_d3utr_olap = le_id_spliced[spliced_d3utr_olap]
le_id_spliced = le_id_spliced[~spliced_d3utr_olap]


print(f"Number of spliced ALE events (after removing those with a novel extended 3'UTR) - {len(set(le_id_spliced))}")


Number of spliced ALE events (before removing those with a novel extended 3'UTR) - 6992
Number of spliced ALE events (after removing those with a novel extended 3'UTR) - 4189


In [45]:
# add le_id used in downstream analysis to original df of novel last exons
novel_le = novel_le.apply(lambda df: df.merge(tx2le, on="transcript_id", how="inner", suffixes=[None, "_quant"]))


In [46]:
# Double check that all novel cryptic LEs are retained
print(f"Fraction of novel cryptic last exons that are retained in the input reference of novel last exons - {len(set(novel_le.subset(lambda df: df.le_id_quant.isin(le_id_spliced_novel_cryp)).le_id_quant)) / len(le_id_spliced_novel_cryp)}")

Fraction of novel cryptic last exons that are retained in the input reference of novel last exons - 1.0


In [47]:
# Select representative PAS for bleedthroughs and spliced events (novel)
novel_le_rep_spliced, rep_choices_spliced = select_rep_site(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_spliced))), id_col="le_id_quant")


print(f"Number of novel spliced intervals before selecting representative LEs - {len(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_spliced))))}")
print(f"Number of novel spliced intervals after selecting representative LEs - {len(novel_le_rep_spliced)}")
print({dec: len(ids) for dec, ids in rep_choices_spliced.items()})




Number of novel spliced intervals before selecting representative LEs - 4500
Number of novel spliced intervals after selecting representative LEs - 1053
{'atlas_1_pred': 460, 'atlas_max_datasets': 106, 'atlas_max_datasets_shortest': 79, 'motif_1_min': 301, 'motif_shortest_min': 84}


In [48]:
# see if any missing ids
missing_novel_cryp_spl = le_id_spliced_novel_cryp - set(novel_le_rep_spliced.subset(lambda df: df.le_id_quant.isin(le_id_spliced_novel_cryp)).le_id_quant)
print(len(missing_novel_cryp_spl))
print(missing_novel_cryp_spl)


3
{'ENSG00000184441.4_1', 'ENSG00000197837.4_1', 'ENSG00000247572.9_4'}


In [49]:
# what are the three missing IDs?
cryptics_df.loc[cryptics_df.le_id.isin(missing_novel_cryp_spl), ["le_id", "gene_name", "simple_event_type", "annot_status"]]

Unnamed: 0,le_id,gene_name,simple_event_type,annot_status
230,ENSG00000184441.4_1,ENSG00000184441,spliced,"annotated,novel"
238,ENSG00000197837.4_1,H4-16,spliced,"annotated,novel"
251,ENSG00000247572.9_4,CKMT2-AS1,spliced,"annotated,novel"


In [50]:
# were they included in the input set of novel last exons provided to function?
missing_novel_cryp_spl.difference(le_id_spliced)

{'ENSG00000184441.4_1', 'ENSG00000197837.4_1', 'ENSG00000247572.9_4'}

In [51]:
# were they excluded because the same last exon has a novel distal 3'UTR extension?
missing_novel_cryp_spl.intersection(le_id_spliced_d3utr_olap)

{'ENSG00000184441.4_1', 'ENSG00000197837.4_1', 'ENSG00000247572.9_4'}

In [53]:
# were they excuded because also annotated as bleedthrough?
missing_novel_cryp_spl.intersection(le_id_spliced_bleed)

set()

In [54]:
# were they in le_id_spliced intially (which was just pulled from dexseq_df)
dexseq_df.loc[dexseq_df.le_id.isin(missing_novel_cryp_spl), ["le_id", "gene_name", "simple_event_type"]].drop_duplicates()

Unnamed: 0,le_id,gene_name,simple_event_type
797,ENSG00000247572.9_4,CKMT2-AS1,spliced
820,ENSG00000184441.4_1,ENSG00000184441,spliced
3002,ENSG00000197837.4_1,H4-16,spliced


Know not present in le_id_spliced
- not because wasn't in df to begin with
- not because also annotated as bleedthrough
- because also has an novel 3'UTR extension at the locus

These events appear to be occurring at very complex loci, so I think it's reasonable not to include them for these maps. Should check IGV though


In [55]:
# Select representative PAS for bleedthroughs and spliced events
novel_le_rep_bleed, rep_choices_bleed = select_rep_site(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_bleed))), id_col="le_id_quant")

print(f"Number of bleedthrough intervals before selecting representative LEs - {len(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_bleed))))}")
print(f"Number of bleedthrough intervals after selecting representative LEs - {len(novel_le_rep_bleed)}")
print({dec: len(ids) for dec, ids in rep_choices_bleed.items()})

Number of bleedthrough intervals before selecting representative LEs - 3358
Number of bleedthrough intervals after selecting representative LEs - 917
{'atlas_1_pred': 414, 'atlas_max_datasets': 94, 'atlas_max_datasets_shortest': 74, 'motif_1_min': 315, 'motif_shortest_min': 14}


In [56]:
# see if any missing ids
missing_cryp_bleed = le_id_bleed_novel_cryp - set(novel_le_rep_bleed.subset(lambda df: df.le_id_quant.isin(le_id_bleed_novel_cryp)).le_id_quant)
len(missing_cryp_bleed)

0

In [57]:
# note this number is lower than the 21 cryptic bleedthrough events I identified... Hopefully that is just because the other 3 are annotated
len(le_id_bleed_novel_cryp)

18

In [58]:
# now want to generate an ID with minimal info required for making maps
# 3'end coordinates/last exon, gene name, le_id, site type, regulation status
# site_type - spliced/bleedthrough
# regulation_status - cryptic/background


# define cryptic le_ids
# define cryptic gene_ids
cryp_le_ids = set(cryptics_df.le_id)
cryp_gene_names = set(cryptics_df.gene_name)


# define cryptic & background le_ids
# background - ns in any dataset (to do this, get list of sig in any dataset)
# get gene IDs with no regulated ALEs in any dataset
# returns pd.Series (index = groupIDs)
ns_gene_ids = (dexseq_df.assign(reg_status=lambda df: np.where(df["padj"].le(0.05), 1, 0))
               .groupby("groupID")
               ["reg_status"]
               .sum()
               .loc[lambda x: x == 0]
             )

# set of le_ids that belong to genes with no significant ALEs (is this too conservative?)
ns_le_ids = set(dexseq_df.loc[dexseq_df["groupID"].isin(ns_gene_ids.index), "le_id"])


print(f"Number of cryptic ALE-containing genes - {len(cryp_gene_names)}")
print(f"Number of cryptic ALE isoforms - {len(cryp_gene_names)}")

print(f"Number of ns ALE-containing genes - {len(set(ns_gene_ids.index))}")
print(f"Number of ns ALE isoforms - {len(set(ns_le_ids))}")


Number of cryptic ALE-containing genes - 250
Number of cryptic ALE isoforms - 250
Number of ns ALE-containing genes - 2938
Number of ns ALE isoforms - 6367


In [99]:
# Alternative definition of background (more conservative)
# not significant in any SH-SY5Y dataset, but expressed/evaluated in at least one of them (so know is multi-ALE in at least one dataset)
# NB: extra-check that expressed/evaluated in all 3? Then confident that is a multi-ALE gene robustly expressed/detected
shsy5y_datasets = ["zanovello_shsy5y_curve_0075", "zanovello_shsy5y_chx_kd_ctl_vs_ctl_ctl", "brown_shsy5y"]
dexseq_df_shsy5y = dexseq_df.loc[dexseq_df.experiment_name.isin(shsy5y_datasets), :]

# first get an experiment count for each event
gene_n_shsy5y_expressed = (dexseq_df_shsy5y
.groupby("groupID")
["experiment_name"]
.nunique()
)

print("Gene counts for the number of SH-SY5Y datasets in which genes are expressed")
print(gene_n_shsy5y_expressed.value_counts())

# get genes expressed in all SH-SY5Y datasets but with no sig change in any of them 
ns_gene_ids_shsy5y = (dexseq_df_shsy5y.loc[dexseq_df_shsy5y.groupID.isin(set(gene_n_shsy5y_expressed[gene_n_shsy5y_expressed == len(shsy5y_datasets)].index)), :]
.assign(reg_status=lambda df: np.where(df["padj"].le(0.05), 1, 0))
               .groupby("groupID")
               ["reg_status"]
               .sum()
               .loc[lambda x: x == 0]
)

# set of
ns_le_ids_shsy5y = set(dexseq_df_shsy5y.loc[dexseq_df_shsy5y["groupID"].isin(ns_gene_ids_shsy5y.index), "le_id"])


print(f"Number of ns ALE-containing genes expressed in all SH-SY5Y datasets - {len(ns_gene_ids_shsy5y.index)}")
print(f"Number of ns ALE isoforms of genes expressed in all SH-SY5Y datasets - {len(set(ns_le_ids_shsy5y))}")

# le_ids in genes with no differential usage in any dataset that are also expressed/assessed in all datasets
ns_le_ids_shsy5y_all = set(dexseq_df_shsy5y[dexseq_df_shsy5y.le_id.isin(ns_le_ids_shsy5y)]
.groupby("le_id")
["experiment_name"]
.nunique()
.loc[lambda x: x == 3]
.index
)

print("Of the genes containing ALE isoforms, how many datasets are the individual isoforms expressed in?")
print(dexseq_df_shsy5y[dexseq_df_shsy5y.le_id.isin(ns_le_ids_shsy5y)]
.groupby("le_id")
["experiment_name"]
.nunique()
.value_counts()
)

print(f"Number of ns ALE isoforms expressed in all SH-SY5Y datasets - {len(ns_le_ids_shsy5y_all)}")
# (dexseq_df.assign(reg_status=lambda df: np.where(df["padj"].le(0.05), 1, 0))
#                .groupby("groupID")
#                ["reg_status"]
#                .sum()
#                
#              )

# dexseq_df.experiment_name.drop_duplicates()

Gene counts for the number of SH-SY5Y datasets in which genes are expressed
experiment_name
3    2582
1    1068
2     704
Name: count, dtype: int64
Number of ns ALE-containing genes expressed in all SH-SY5Y datasets - 1577
Number of ns ALE isoforms of genes expressed in all SH-SY5Y datasets - 3624
Of the genes containing ALE isoforms, how many datasets are the individual isoforms expressed in?
experiment_name
3    3304
1     185
2     135
Name: count, dtype: int64
Number of ns ALE isoforms expressed in all SH-SY5Y datasets - 3304


In [108]:
# assign cryptic status for novel bleedthroughs, using different SHSY5y genes as background
# novel_le_rep_bleed_shsy5y = novel_le_rep_bleed.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids_shsy5y, id_col="le_id_quant"))

# use le_id_bleed_novel_cryp - novel to annotation cryptic bleedthrough last exons
novel_le_rep_bleed_shsy5y_all = novel_le_rep_bleed.assign("cryptic_status", lambda df: _define_cryptic_status(df, le_id_bleed_novel_cryp, ns_le_ids_shsy5y_all, id_col="le_id_quant"))
print("Number of events using isoforms expressed in all SH-SY5Y datasets")
print(novel_le_rep_bleed_shsy5y_all.cryptic_status.value_counts())


# assign cryptic status for novel bleedthroughs, using standard ns as background (ns in all datasets)
novel_le_rep_bleed = novel_le_rep_bleed.assign("cryptic_status", lambda df: _define_cryptic_status(df, le_id_bleed_novel_cryp, ns_le_ids, id_col="le_id_quant"))
novel_le_rep_bleed.cryptic_status.value_counts()


Number of events using isoforms expressed in all SH-SY5Y datasets
cryptic_status
NULL          690
background    209
cryptic        18
Name: count, dtype: int64


cryptic_status
NULL          641
background    258
cryptic        18
Name: count, dtype: int64

In [109]:
# assign cryptic status for novel spliced events, using isoforms expressed in all SH-SY5Y experiments but unchanged in all as background
novel_le_rep_spliced_shsy5y_all = novel_le_rep_spliced.assign("cryptic_status", lambda df: _define_cryptic_status(df, le_id_spliced_novel_cryp, ns_le_ids_shsy5y_all, id_col="le_id_quant"))
print("Number of events using isoforms expressed in all SH-SY5Y datasets")
print(novel_le_rep_spliced_shsy5y_all.cryptic_status.value_counts())


# assign cryptic status for novel spliced events
novel_le_rep_spliced = novel_le_rep_spliced.assign("cryptic_status", lambda df: _define_cryptic_status(df, le_id_spliced_novel_cryp, ns_le_ids, id_col="le_id_quant"))
novel_le_rep_spliced.as_df().drop_duplicates(subset="le_id_quant").cryptic_status.value_counts()

Number of events using isoforms expressed in all SH-SY5Y datasets
cryptic_status
NULL          736
background    249
cryptic        68
Name: count, dtype: int64


cryptic_status
NULL          693
background    269
cryptic        68
Name: count, dtype: int64

In [155]:
def construct_bed_name(gr: pr.PyRanges, site_type: str, gene_name_col: str = "gene_name_ref", le_id_col: str = "le_id_quant"):
     '''construct name field of output file - le_id|gene_name|site_type|cryptic_status

     Parameters
     ----------
     gr : pr.PyRanges
         _description_
     site_type : str
         _description_

     Returns
     -------
     _type_
         _description_
     '''
     # remove genes not assigned to cryptic/background
     gr = gr.subset(lambda df: df["cryptic_status"].ne("NULL"))
    
     # some ref_gene_name entries are duplicated - collapse as appropriate
     gr = gr.assign(gene_name_col + "_tmp",
         # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
         # some ref_gene_name entries have multiple gene names
         lambda df: df[gene_name_col].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
         )

     # assign temp col of category of event
     gr = gr.assign("site_type", lambda df: pd.Series([site_type]*len(df.index), index=df.index))
    
    # Assign final Name field
     gr = (gr.assign("Name",
                    lambda df: df[le_id_col].str.cat(df[[gene_name_col + "_tmp", "site_type", "cryptic_status"]], sep="|"))
                    .drop(["site_type", gene_name_col + "_tmp"])
                    )

     return gr

In [118]:
# Construct 'name' string with minimal annotation information
# le_id|gene_name|site_type|cryptic_status
novel_le_rep_bleed = construct_bed_name(novel_le_rep_bleed, "bleedthrough")
novel_le_rep_bleed_shsy5y_all = construct_bed_name(novel_le_rep_bleed_shsy5y_all, "bleedthrough")
novel_le_rep_spliced = construct_bed_name(novel_le_rep_spliced, "spliced")
novel_le_rep_spliced_shsy5y_all = construct_bed_name(novel_le_rep_spliced_shsy5y_all, "spliced")

In [None]:
# # Construct 'name' string with minimal annotation information
# # le_id|gene_name|site_type|cryptic_status

# novel_le_rep_spliced = (novel_le_rep_spliced
#  .subset(lambda df: df["cryptic_status"].ne("NULL"))
#  .assign("gene_name_ref",
#          # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
#          # some ref_gene_name entries have multiple gene names
#          lambda df: df["gene_name_ref"].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
#          )
#          .assign("site_type",
#                  lambda df: pd.Series(["spliced"]*len(df.index), index=df.index))
#  .assign("Name",
#                           lambda df: df["le_id_quant"].str.cat(df[["gene_name_ref", "site_type", "cryptic_status"]], sep="|"))
                    
# )

# novel_le_rep_bleed = (novel_le_rep_bleed
#  .subset(lambda df: df["cryptic_status"].ne("NULL"))
#  .assign("gene_name_ref",
#          # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
#          # some ref_gene_name entries have multiple gene names
#          lambda df: df["gene_name_ref"].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
#          )
#          .assign("site_type",
#                  lambda df: pd.Series(["bleedthrough"]*len(df.index), index=df.index))
#  .assign("Name",
#                           lambda df: df["le_id_quant"].str.cat(df[["gene_name_ref", "site_type", "cryptic_status"]], sep="|"))
                    
# )

In [None]:
# novel_le_rep_spliced.subset(lambda df: df.cryptic_status == "cryptic")[["le_id","Name"]]

In [119]:
# for spliced events, want to report the last exon coordinates (also split into 5'end & 3'end)
# for this purpose, current coordiantes (from input last exons) is sufficient.
# but for bleedthrough events, this will contain the complete last exon. So need to return to quant last exons to get the unique regions. 

# novel_le_rep_bleed - extract transcript_id (i.e. represents selected isoform) & downstream le_id (le_id_quant)
novel_rep_bleed_tx2le = novel_le_rep_bleed.as_df()[["transcript_id", "le_id_quant", "Name"]].rename(columns={"le_id_quant": "le_id"})
novel_rep_bleed_shsy5y_all_tx2le = novel_le_rep_bleed_shsy5y_all.as_df()[["transcript_id", "le_id_quant", "Name"]].rename(columns={"le_id_quant": "le_id"})
novel_rep_bleed_tx2le



Unnamed: 0,transcript_id,le_id,Name
0,PAPA.CTRL-6.141.2,ENSG00000041988.16_2,ENSG00000041988.16_2|THAP3|bleedthrough|backgr...
1,PAPA.DZ_curves_0_2.2152.1,ENSG00000054282.16_2,ENSG00000054282.16_2|SDCCAG8|bleedthrough|back...
2,PAPA.doxconc_DOX_0075_1.1061.3,ENSG00000116830.12_2,ENSG00000116830.12_2|TTF2|bleedthrough|background
3,PAPA.doxconc_NT_0_3.354.1,ENSG00000117682.17_6,ENSG00000117682.17_6|DHDDS|bleedthrough|backgr...
4,PAPA.NT_19074719_S22.1162.5,ENSG00000122481.17_2,ENSG00000122481.17_2|RWDD3|bleedthrough|backgr...
...,...,...,...
271,PAPA.chx_tdp_DOX_ctrl_1.20539.9,ENSG00000147255.19_1,ENSG00000147255.19_1|IGSF1|bleedthrough|backgr...
272,PAPA.TDP43-G_S7.20350.2,ENSG00000180182.11_3,ENSG00000180182.11_3|MED14|bleedthrough|cryptic
273,PAPA.chx_tdp_DOX_ctrl_4.20438.1,ENSG00000181544.16_2,ENSG00000181544.16_2|FANCB|bleedthrough|backgr...
274,PAPA.doxconc_DOX_0075_3.18826.2,ENSG00000183943.6_2,ENSG00000183943.6_2|PRKX|bleedthrough|background


In [141]:
# subset to representative bleedthrough txs, joining in Name information
# 5'coord = 1st cooridnate of intron (i.e. where spliced occurs)
# 3'coord = final coordinate of last exon
novel_le_rep_bleed_quant = quant_uniq_le.apply(lambda df: df.merge(novel_rep_bleed_tx2le, on=["le_id"], how="inner"))
novel_le_rep_bleed_shsy5y_all_quant = quant_uniq_le.apply(lambda df: df.merge(novel_rep_bleed_shsy5y_all_tx2le, on=["le_id"], how="inner"))

# double check number of le_ids remains the same
print(f"Liberal background - Number of le_ids in full last exons - {novel_le_rep_bleed.le_id_quant.nunique()} - vs the quantification truncated last exons - {novel_le_rep_bleed_quant.le_id.nunique()}")
print(f"SH-SY5Y background - Number of le_ids in full last exons - {novel_le_rep_bleed_shsy5y_all.le_id_quant.nunique()} - vs the quantification truncated last exons - {novel_le_rep_bleed_shsy5y_all_quant.le_id.nunique()}")

# novel_le_rep_bleed_quant[["Score", "Name", "transcript_id", "le_id"]].drop_duplicate_positions()


Liberal background - Number of le_ids in full last exons - 275 - vs the quantification truncated last exons - 275
SH-SY5Y background - Number of le_ids in full last exons - 226 - vs the quantification truncated last exons - 226


### Repeat for annotated

In [143]:
# get le_ids in which all transcripts are annotated
annotated_le_ids = dexseq_df.groupby("le_id").filter(lambda df: (df.annot_status == "annotated").all()).le_id
le_id_bleed_annot = set(le_id_bleed).intersection(set(annotated_le_ids))
le_id_spliced_annot = set(le_id_spliced).intersection(set(annotated_le_ids))
print(f"Number of annotated bleedthrough last exons (assessed by DEXSeq) - {len(le_id_bleed_annot)}")
print(f"Number of annotated spliced last exons (assessed by DEXSeq) - {len(le_id_spliced_annot)}")


Number of annotated bleedthrough last exons (assessed by DEXSeq) - 1668
Number of annotated spliced last exons (assessed by DEXSeq) - 3159


In [144]:
# subset quantification GTF for spliecd last exons 
ref_le_spliced = quant_uniq_le.subset(lambda df: df.le_id.isin(le_id_spliced_annot))
# select representative isoform for each last exon ID (most distal 3'end)
ref_le_spliced = select_rep_prox_site(ref_le_spliced, "le_id")

# subset quantification GTF for bleedthrough last exons 
ref_le_bleed = quant_uniq_le.subset(lambda df: df.le_id.isin(le_id_bleed_annot))
# select representative isoform for each last exon ID (most distal 3'end)
ref_le_bleed = select_rep_prox_site(ref_le_bleed, "le_id")

In [150]:
# assign cryptic status for annotatesd spliced last exons
ref_le_spliced_shsy5y_all = ref_le_spliced.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids_shsy5y_all, id_col="le_id"))
print(ref_le_spliced_shsy5y_all.cryptic_status.value_counts())


ref_le_spliced = ref_le_spliced.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids, id_col="le_id"))
ref_le_spliced.cryptic_status.value_counts()


cryptic_status
NULL          3019
background     837
cryptic         35
Name: count, dtype: int64


cryptic_status
NULL          2938
background     918
cryptic         35
Name: count, dtype: int64

In [151]:
# assign cryptic status for annotatesd bleedthrough last exons
ref_le_bleed_shsy5y_all = ref_le_bleed.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids_shsy5y_all, id_col="le_id"))
print(ref_le_bleed_shsy5y_all.cryptic_status.value_counts())

ref_le_bleed = ref_le_bleed.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids, id_col="le_id"))
ref_le_bleed.cryptic_status.value_counts()

cryptic_status
NULL          1416
background     359
cryptic          3
Name: count, dtype: int64


cryptic_status
NULL          1331
background     444
cryptic          3
Name: count, dtype: int64

In [156]:
# Construct Name field with minimal annotation info

ref_le_bleed = construct_bed_name(ref_le_bleed, "bleedthrough", gene_name_col="ref_gene_name", le_id_col="le_id")
ref_le_bleed_shsy5y_all = construct_bed_name(ref_le_bleed_shsy5y_all, "bleedthrough", gene_name_col="ref_gene_name", le_id_col="le_id")
ref_le_spliced = construct_bed_name(ref_le_spliced, "spliced", gene_name_col="ref_gene_name", le_id_col="le_id")
ref_le_spliced_shsy5y_all = construct_bed_name(ref_le_spliced_shsy5y_all, "spliced", gene_name_col="ref_gene_name", le_id_col="le_id")

In [157]:
# combined output BEDs
spliced_out_bed = pr.concat([novel_le_rep_spliced[["Score", "Name"]].drop_duplicate_positions(),
                             ref_le_spliced[["Score", "Name"]].drop_duplicate_positions()])

spliced_out_bed_shsy5y_all = pr.concat([novel_le_rep_spliced_shsy5y_all[["Score", "Name"]].drop_duplicate_positions(),
                             ref_le_spliced_shsy5y_all[["Score", "Name"]].drop_duplicate_positions()])

# ...
bleed_out_bed = pr.concat([novel_le_rep_bleed_quant[["Score", "Name"]].drop_duplicate_positions(),
                             ref_le_bleed[["Score", "Name"]].drop_duplicate_positions()])

bleed_out_bed_shsy5y_all = pr.concat([novel_le_rep_bleed_shsy5y_all_quant[["Score", "Name"]].drop_duplicate_positions(),
                             ref_le_bleed_shsy5y_all[["Score", "Name"]].drop_duplicate_positions()])


In [165]:
def check_cryptics(gr, cryptic_le_ids: set):
    '''return set of cryptic IDs NOT found in gr

    Parameters
    ----------
    gr : _type_
        _description_
    '''
    # temporarily create le_id
    gr_le_id = set(gr.Name.str.split("|", expand=True)[0])

    return cryptic_le_ids.difference(gr_le_id)

# double check if all cryptics retained
print(f"Number of spliced cryptic IDs not found in std spliced BED file - {len(check_cryptics(spliced_out_bed, event_type_le_ids['spliced']))}")
print(f"Number of spliced cryptic IDs not found in shsy5y only spliced BED file - {len(check_cryptics(spliced_out_bed_shsy5y_all, event_type_le_ids['spliced']))}")
print(f"Number of bleedthrough cryptic IDs not found in std bleedthrough BED file - {len(check_cryptics(bleed_out_bed, event_type_le_ids['bleedthrough']))}")
print(f"Number of bleedthrough cryptic IDs not found in shsy5y only bleedthrough BED file - {len(check_cryptics(bleed_out_bed_shsy5y_all, event_type_le_ids['bleedthrough']))}")

Number of spliced cryptic IDs not found in std spliced BED file - 23
Number of spliced cryptic IDs not found in shsy5y only spliced BED file - 23
Number of bleedthrough cryptic IDs not found in std bleedthrough BED file - 0
Number of bleedthrough cryptic IDs not found in shsy5y only bleedthrough BED file - 0


In [None]:
# combined output BEDs
spliced_out_bed = pr.concat([novel_le_rep_spliced[["Score", "Name"]].drop_duplicate_positions(),
                             ref_le_spliced[["Score", "Name"]].drop_duplicate_positions()])

bleed_out_bed = pr.concat([novel_le_rep_bleed_quant[["Score", "Name"]].drop_duplicate_positions(),
                             ref_le_bleed[["Score", "Name"]].drop_duplicate_positions()])

spliced_out_bed.subset(lambda df: df.Name.str.contains("cryptic$"))

# novel_le_rep_bleed_quant[["Score", "Name"]].drop_duplicate_positions()
# novel_le_rep_spliced[["Score", "Name"]].drop_duplicate_positions()


# generate output bed file of proximal and distal PAS
# utr3_out_bed = pr.concat([sel_le_d3utr_prox[["Score", "Name"]],
#                           sel_novel_le_d3utr[["Score", "Name"]]]
#                           )

# print(len(utr3_out_bed))
# # drop duplicate entries
# utr3_out_bed = utr3_out_bed.apply(lambda df: df.drop_duplicates()).sort()

# utr3_out_bed

In [None]:
# which cryptics are missing from each df
missing_cryp_spl_all = le_id_spliced_cryp.difference(set(spliced_out_bed.subset(lambda df: df.Name.str.contains("cryptic$")).Name.str.split("|", expand=True)[0]))
print(f"fraction of cryptic spliced events retained in output - {len(le_id_spliced_cryp.intersection(set(spliced_out_bed.subset(lambda df: df.Name.str.contains('cryptic$')).Name.str.split('|', expand=True)[0]))) / len(le_id_spliced_cryp)}")
print(f"number of missing ids - {len(missing_cryp_spl_all)}")

In [None]:
# how many missing spliced events are novel?
missing_cryp_spl_all_novel = set(novel_le.subset(lambda df: df.le_id_quant.isin(missing_cryp_spl_all)).le_id_quant)
print(len(missing_cryp_spl_all_novel))
novel_le.subset(lambda df: df.le_id_quant.isin(missing_cryp_spl_all_novel))


In [None]:
# how many missing cryptic spliced events? partners to 3'UTR extensions
le_id_d3utr_spliced_prtnr = set(dexseq_df.loc[dexseq_df.simple_event_type == "spliced", "le_id"][spliced_d3utr_olap].drop_duplicates())
print(len(missing_cryp_spl_all.intersection(le_id_d3utr_spliced_prtnr)))
",".join(dexseq_df[dexseq_df.le_id.isin(missing_cryp_spl_all.intersection(le_id_d3utr_spliced_prtnr))].gene_name.drop_duplicates())

In [None]:
# above 23 are probs better off not including in spliced ( & rather as cryptic 3'UTR changes, as more likely to be reg at site level rather than SJ)
# of the 3 novel, are they partners to 3'UTR extensions? (NB: how?)
missing_cryp_spl_all_novel.intersection(missing_cryp_spl_all.intersection(le_id_d3utr_spliced_prtnr))

In [None]:
# of the 34, are the remaining events all annotated?
missing_cryp_spl_all_n_d3utr_prtnr = missing_cryp_spl_all.difference(missing_cryp_spl_all.intersection(le_id_d3utr_spliced_prtnr))
print(len(missing_cryp_spl_all_n_d3utr_prtnr))
missing_cryp_spl_all_n_d3utr_prtnr.difference(set(ref_le_spliced.le_id))

In [None]:
dexseq_df[dexseq_df.le_id.isin(missing_cryp_spl_all_n_d3utr_prtnr)][["le_id", "gene_name", "annot_status", "event_type", "simple_event_type"]].drop_duplicates()

In [None]:
spliced_out_bed.subset(lambda df: df.Name.str.contains("STMN2", regex=False))

In [None]:
# how many le_ids have distinct coordinates? which collapsing category do they belong to?
Counter([k if le_id.split("|")[0] in v else None for le_id in spliced_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index for k,v in rep_choices_spliced.items()])

In [None]:
spliced_dupe = [le_id.split("|")[0] for  le_id in spliced_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index 
    if le_id.split("|")[0] in set(rep_choices_spliced["atlas_1_pred"]).union(set(rep_choices_spliced["atlas_max_datasets"]))
    ]

spliced_dupe

In [None]:
novel_le_rep_spliced.subset(lambda df: df.le_id_quant.isin(spliced_dupe))[["le_id_quant", "event_type", "site_type", "cryptic_status"]]

In [None]:
# all occur where 3' coordinate is the same, but differ in their 5'coordinates
# 1 - keep all possible 5' & 3' coordinates?
# 2 - keep the shortest one (similar to how I have filtered previously)

In [None]:
bleed_out_bed.subset(lambda df: df.Name.str.contains("cryptic$"))

In [None]:
# how many le_ids have distinct coordinates? which collapsing category do they belong to?
Counter([k if le_id.split("|")[0] in v else None for le_id in bleed_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index for k,v in rep_choices_bleed.items()])

In [None]:
bleed_dupe = [le_id.split("|")[0] for  le_id in bleed_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index 
    if le_id.split("|")[0] in set(rep_choices_bleed["atlas_1_pred"]).union(set(rep_choices_bleed["atlas_max_datasets"]).union(set(rep_choices_bleed["motif_1_min"])))
    ]

bleed_dupe

In [None]:
novel_le_rep_bleed.subset(lambda df: df.le_id_quant.isin(bleed_dupe))[["le_id_quant", "event_type", "site_type", "cryptic_status"]]

In [None]:
# select representative 5'end coord (the shoretest) for a given le_id
bleed_out_bed = select_rep_five_end(bleed_out_bed, id_col = "Name")
spliced_out_bed = select_rep_five_end(spliced_out_bed, id_col = "Name")

In [None]:
# how many le_ids have distinct coordinates? which collapsing category do they belong to?
Counter([k if le_id.split("|")[0] in v else None for le_id in bleed_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index for k,v in rep_choices_bleed.items()])

In [None]:
# how many le_ids have distinct coordinates? which collapsing category do they belong to?
Counter([k if le_id.split("|")[0] in v else None for le_id in spliced_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index for k,v in rep_choices_spliced.items()])

In [None]:
[le_id.split("|")[0] for le_id in spliced_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index if le_id.split("|")[0] in rep_choices_spliced["atlas_1_pred"]]

In [None]:
spliced_out_bed.Name.str.split("|", expand=True)[3].value_counts()

In [None]:
bleed_out_bed.Name.str.split("|", expand=True)[[3]].value_counts()

In [None]:
if not os.path.exists("../processed/iclip_regions"):
    os.makedirs("../processed/iclip_regions")

#### TEMPORARY - work out why duplicate regions/multiple PAS coords for some last exons. To prevent duplciats slect longest per interval
spliced_out_bed = select_rep_prox_site(spliced_out_bed, "Name")
bleed_out_bed = select_rep_prox_site(bleed_out_bed, "Name")

spliced_out_bed.to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_spliced.last_exons.bed")
# also make one with just polYA sites
spliced_out_bed.three_end().to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_spliced.pas.bed")
# also first coordinate of exon (i.e. interval ending at this coordinate = splice site end, or substract 1 (strand-aware) to get final nucleotide of splice site)
spliced_out_bed.five_end().to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_spliced.le_start.bed")

bleed_out_bed.to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_bleedthrough.last_exons.bed")
# also make one with just polYA sites
bleed_out_bed.three_end().to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_bleedthrough.pas.bed")
# also first coordinate of exon (i.e. 1st nucleotide of intron
bleed_out_bed.five_end().to_bed("../processed/iclip_regions/2023-07-04_papa_cryptic_bleedthrough.le_start.bed")