In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
from define_helpers import *
import os

In [2]:
# input novel last exons used to generate combined reference of last exons
novel_le = pr.read_gtf("../data/papa/2023-03-29_papa_i3_cortical_upf1_zanovello_overlap_annotated.gtf")
# dexseq results df (used to extract cryptic events of each class)
dexseq_df = pd.read_csv("../data/papa/2023-05-24_i3_cortical_zanovello.all_datasets.dexseq_apa.results.processed.cleaned.tsv", sep="\t")
# reference GTF used to identify novel last exons, quantify vs ref
ref_gtf = pr.read_gtf("../data/reference_filtered.gtf")
# lookup table keyed on transcript_id; merged onto novel_le further down to attach the le_id used downstream
# (presumably one row per transcript with its le_id - TODO confirm against the file)
tx2le = pd.read_csv("../data/papa/novel_ref_combined.tx2le.tsv", sep="\t")

# last exon quantification regions used as input to Salmon
quant_uniq_le = pr.read_gtf("../data/papa/novel_ref_combined.quant.last_exons.gtf")


# NOTE(review): presumably metadata for the combined novel + reference last exons - confirm contents;
# not referenced by the cells immediately following
info_df = pd.read_csv("../data/papa/novel_ref_combined.info.tsv", sep="\t")

In [3]:
# Tally how many rows of the dexseq results fall into each simple event type.
# These labels are what the le_ids of each event class are selected on.
dexseq_df["simple_event_type"].value_counts()


simple_event_type
spliced                  130806
bleedthrough              41453
distal_3utr_extension     37225
Name: count, dtype: int64

In [4]:
# Collect the le_id values belonging to each simple event type.
# Rows keep their original dexseq_df index, and repeated le_ids are not collapsed.
event_type = dexseq_df["simple_event_type"]
le_id_all = dexseq_df["le_id"]

le_id_spliced = le_id_all[event_type.eq("spliced")]
le_id_bleed = le_id_all[event_type.eq("bleedthrough")]
le_id_d3utr = le_id_all[event_type.eq("distal_3utr_extension")]


In [5]:
# Good idea to ensure we don't consider spliced LEs that have a 3'UTR extension.
# UPDATE: now need to select a representative proximal site for each distal site, since the
# quantification approach lumps together all annotated proximal sites (and compares them to the extension).
# Since le_ids are annotated sequentially, the immediately succeeding le_number is the partner le_id.
#
# le_id has the form "<gene_id>_<le_number>". Split on the LAST underscore only: splitting on every
# underscore breaks as soon as a gene_id itself contains "_" (e.g. GENCODE-style "_PAR_Y" ids, if any
# are present in the reference), because column 1 would then hold a non-numeric token.
le_id_spliced_spl = le_id_spliced.str.rsplit("_", n=1, expand=True)
le_id_spliced_spl[1] = le_id_spliced_spl[1].astype(int).add(1)
# reconstruct le_id (corresponding to theoretical distal 3'UTR extension le_id)
le_id_spliced_d3utr = le_id_spliced_spl[0].str.cat(le_id_spliced_spl[1].astype(str), sep="_")

# now have theoretical distal 3'UTR le_ids, intersect with actual ids - any overlapping will be removed from downstream analysis
# (boolean mask aligned to le_id_spliced, True where the partner extension le_id really exists)
spliced_d3utr_olap = le_id_spliced_d3utr.isin(le_id_d3utr.values)
# NOTE: this counts rows of le_id_spliced; the same le_id can occur on several rows, so it is not a count of unique le_ids
print(f"Number of spliced last exons with 3'UTR extension - {spliced_d3utr_olap.sum()}")

# now use mask to remove these ALEs
# NOTE(review): le_id_spliced is overwritten in place, so re-running this cell reports the count for the
# already-filtered series rather than the original one
le_id_spliced = le_id_spliced[~spliced_d3utr_olap]
le_id_spliced


Number of spliced last exons with 3'UTR extension - 58895


0         ENSG00000021645.20_1
1         ENSG00000021645.20_4
2         ENSG00000021645.20_4
3         ENSG00000048649.14_1
4         ENSG00000048649.14_1
                  ...         
209475     ENSG00000278311.5_5
209480     ENSG00000280832.2_1
209481     ENSG00000280832.2_2
209482     ENSG00000282508.2_4
209483     ENSG00000282508.2_6
Name: le_id, Length: 71911, dtype: object

In [6]:
# Attach the le_id used in downstream analysis: tx2le is inner-joined on transcript_id, and any
# column name shared with tx2le gets the "_quant" suffix on the tx2le side (hence le_id_quant).
# NOTE(review): novel_le is overwritten here, so re-running this cell would merge tx2le a second time.
novel_le = novel_le.apply(
    lambda sub_df: sub_df.merge(tx2le, on="transcript_id", how="inner", suffixes=[None, "_quant"])
)


# Select representative PAS for bleedthroughs and spliced events (spliced events handled here).
# Restrict to the spliced le_ids first, then let select_rep_site choose one site per le_id_quant.
spliced_le_ids = set(le_id_spliced)
novel_le_spliced = novel_le.subset(lambda sub_df: sub_df.le_id_quant.isin(spliced_le_ids))
novel_le_rep_spliced, rep_choices_spliced = select_rep_site(novel_le_spliced, id_col="le_id_quant")

# Report how many ids were resolved by each selection rule.
rep_choice_counts = {dec: len(ids) for dec, ids in rep_choices_spliced.items()}
print(rep_choice_counts)


{'atlas_1_pred': 571, 'atlas_max_datasets': 143, 'atlas_max_datasets_shortest': 98, 'motif_1_min': 321, 'motif_shortest_min': 160}
