In [42]:
import pyranges as pr
import pandas as pd
import numpy as np
from define_helpers import select_rep_site, _define_cryptic_status, select_rep_prox_site
from collections import Counter
import os

In [2]:
# input novel last exons used to generate combined reference of last exons
novel_le = pr.read_gtf("../data/papa/2023-03-29_papa_i3_cortical_upf1_zanovello_overlap_annotated.gtf")
# dexseq results df (used to extract cryptic events of each class)
dexseq_df = pd.read_csv("../data/papa/2023-05-24_i3_cortical_zanovello.all_datasets.dexseq_apa.results.processed.cleaned.tsv", sep="\t")
cryptics_df = pd.read_csv("../../preprocessing/processed/cryptics_summary_all_events.tsv", sep="\t")

# refeence GTF used to identify novel last exons, quantify vs ref
ref_gtf = pr.read_gtf("../data/reference_filtered.gtf")
# 
tx2le = pd.read_csv("../data/papa/novel_ref_combined.tx2le.tsv", sep="\t")

# last exon quantification regions used as input to Salmon
quant_uniq_le = pr.read_gtf("../data/papa/novel_ref_combined.quant.last_exons.gtf")


info_df = pd.read_csv("../data/papa/novel_ref_combined.info.tsv", sep="\t")

In [3]:
# get le_ids that are novel bleedthroughs or ALEs
# le_id_spliced = set(dexseq_df[dexseq_df.simple_event_type == "distal_3utr_extension"].le_id)
# le_id_bleed = set(dexseq_df[dexseq_df.simple_event_type == "distal_3utr_extension"].le_id)
dexseq_df.simple_event_type.value_counts()


simple_event_type
spliced                  130806
bleedthrough              41453
distal_3utr_extension     37225
Name: count, dtype: int64

In [4]:
le_id_spliced = dexseq_df.loc[dexseq_df.simple_event_type == "spliced", "le_id"]
le_id_bleed = dexseq_df.loc[dexseq_df.simple_event_type == "bleedthrough", "le_id"]
le_id_d3utr = dexseq_df.loc[dexseq_df.simple_event_type == "distal_3utr_extension", "le_id"]


In [5]:
print(f"Number of spliced ALE events - {len(set(le_id_spliced))}")
print(f"Number of bleedthrough ALE events - {len(set(le_id_bleed))}")
print(f"Number of 3'UTR-ALE events - {len(set(le_id_d3utr))}")

Number of spliced ALE events - 7978
Number of bleedthrough ALE events - 3599
Number of 3'UTR-ALE events - 3369


In [6]:
cryptics_df.simple_event_type.value_counts()

simple_event_type
spliced                               119
distal_3utr_extension                 104
bleedthrough                           55
bleedthrough,spliced                   12
bleedthrough,distal_3utr_extension      3
Name: count, dtype: int64

In [7]:
cryptics_df.annot_status.value_counts()

annot_status
novel              204
annotated           70
annotated,novel     19
Name: count, dtype: int64

In [8]:
cryptics_df[["annot_status","simple_event_type"]].value_counts(sort=False)

annot_status     simple_event_type                 
annotated        bleedthrough                           18
                 bleedthrough,spliced                    4
                 spliced                                48
annotated,novel  bleedthrough,spliced                    6
                 spliced                                13
novel            bleedthrough                           37
                 bleedthrough,distal_3utr_extension      3
                 bleedthrough,spliced                    2
                 distal_3utr_extension                 104
                 spliced                                58
Name: count, dtype: int64

In [9]:
# get a set of novel cryptic IDs so can track where they go
le_id_spliced_novel_cryp = set(cryptics_df[(cryptics_df["annot_status"] == "novel") & (cryptics_df["simple_event_type"] == "spliced")].le_id)
le_id_bleed_novel_cryp = set(cryptics_df[(cryptics_df["annot_status"] == "novel") & (cryptics_df["simple_event_type"] == "bleedthrough")].le_id)



In [10]:
# good idea to ensure don't consider spliced LEs that have a 3'UTR extension
# UPDATE now need to select representative proximal site for each distal site, since quantification approach lumps together all annotated proximal sites (and compares them to extension) 
# since le_ids are annotated sequentially, know that immediately succeeding le_number is the partner le_id
le_id_spliced_spl = le_id_spliced.str.split("_", regex=False, expand=True)
le_id_spliced_spl[1] = le_id_spliced_spl[1].astype(int).add(1)
# reconstruct le_id (corresponding to theoretical distal 3'UTR extension le_id)
le_id_spliced_d3utr = le_id_spliced_spl[0].str.cat(le_id_spliced_spl[1].astype(str), sep="_")

# now have theoretical distal 3'UTR le_ids, intersect with actual ids - any overlapping will be removed from downstream analysis
spliced_d3utr_olap = le_id_spliced_d3utr.isin(le_id_d3utr.values)
# print(f"Number of spliced last exons with 3'UTR extension - {spliced_d3utr_olap.sum()}")

# now use mask to remove these ALEs
le_id_spliced = le_id_spliced[~spliced_d3utr_olap]
print(f"Number of spliced ALE events (after removing those with a novel extended 3'UTR) - {len(set(le_id_spliced))}")


Number of spliced ALE events (after removing those with a novel extended 3'UTR) - 4854


In [11]:
# add le_id used in downstream analysis
novel_le = novel_le.apply(lambda df: df.merge(tx2le, on="transcript_id", how="inner", suffixes=[None, "_quant"]))


In [12]:
len(set(novel_le.subset(lambda df: df.le_id_quant.isin(le_id_spliced_novel_cryp)).le_id_quant))

58

In [58]:
novel_le.subset(lambda df: df.gene_name_ref.str.contains("STMN2", regex=False))[["Name"]].as_df().to_string()

'  Chromosome     Start       End Strand          Name\n0       chr8  79616821  79617073      +  8:79617071:+\n1       chr8  79616821  79617049      +  8:79617071:+\n2       chr8  79616821  79617073      +  8:79617071:+\n3       chr8  79616821  79617073      +  8:79617071:+\n4       chr8  79616821  79617071      +  8:79617071:+\n5       chr8  79616821  79617073      +  8:79617071:+\n6       chr8  79616821  79623859      +  8:79623859:+'

In [13]:

# Select representative PAS for bleedthroughs and spliced events (novel)
novel_le_rep_spliced, rep_choices_spliced = select_rep_site(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_spliced))), id_col="le_id_quant")


print(f"Number of novel spliced intervals before selecting representative LEs - {len(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_spliced))))}")
print(f"Number of novel spliced intervals after selecting representative LEs - {len(novel_le_rep_spliced)}")
print({dec: len(ids) for dec, ids in rep_choices_spliced.items()})




Number of novel spliced intervals before selecting representative LEs - 6209
Number of novel spliced intervals after selecting representative LEs - 1818
{'atlas_1_pred': 571, 'atlas_max_datasets': 143, 'atlas_max_datasets_shortest': 98, 'motif_1_min': 321, 'motif_shortest_min': 160}


In [14]:
# see if any missing ids
missing_cryp_spl = le_id_spliced_novel_cryp - set(novel_le_rep_spliced.subset(lambda df: df.le_id_quant.isin(le_id_spliced_novel_cryp)).le_id_quant)
len(missing_cryp_spl)


0

In [15]:
# Select representative PAS for bleedthroughs and spliced events
novel_le_rep_bleed, rep_choices_bleed = select_rep_site(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_bleed))), id_col="le_id_quant")

print(f"Number of bleedthrough intervals before selecting representative LEs - {len(novel_le.subset(lambda df: df.le_id_quant.isin(set(le_id_bleed))))}")
print(f"Number of bleedthrough intervals after selecting representative LEs - {len(novel_le_rep_bleed)}")
print({dec: len(ids) for dec, ids in rep_choices_bleed.items()})

Number of bleedthrough intervals before selecting representative LEs - 5952
Number of bleedthrough intervals after selecting representative LEs - 1714
{'atlas_1_pred': 590, 'atlas_max_datasets': 152, 'atlas_max_datasets_shortest': 105, 'motif_1_min': 350, 'motif_shortest_min': 114}


In [16]:
# see if any missing ids
missing_cryp_bleed = le_id_bleed_novel_cryp - set(novel_le_rep_bleed.subset(lambda df: df.le_id_quant.isin(le_id_bleed_novel_cryp)).le_id_quant)
len(missing_cryp_bleed)

0

In [17]:
# now want to generate an ID with minimal info required for making maps
# 3'end coordinates/last exon, gene name, le_id, site type, regulation status
# site_type - spliced/bleedthrough
# regulation_status - cryptic/background


# define cryptic le_ids
# define cryptic gene_ids
cryp_le_ids = set(cryptics_df.le_id)
cryp_gene_names = set(cryptics_df.gene_name)


# define cryptic & background le_ids
# background - ns in any dataset (to do this, get list of sig in any dataset)
# get gene IDs with no regulated ALEs in any dataset
# returns pd.Series (index = groupIDs)
ns_gene_ids = (dexseq_df.assign(reg_status=lambda df: np.where(df["padj"].le(0.05), 1, 0))
               .groupby("groupID")
               ["reg_status"]
               .sum()
               .loc[lambda x: x == 0]
             )

# set of le_ids that belong to genes with no significant ALEs (is this too conservative?)
ns_le_ids = set(dexseq_df.loc[dexseq_df["groupID"].isin(ns_gene_ids.index), "le_id"])


print(f"Number of cryptic ALE-containing genes - {len(cryp_gene_names)}")
print(f"Number of cryptic ALE isoforms - {len(cryp_gene_names)}")

print(f"Number of ns ALE-containing genes - {len(set(ns_gene_ids.index))}")
print(f"Number of ns ALE isoforms - {len(set(ns_le_ids))}")


Number of cryptic ALE-containing genes - 283
Number of cryptic ALE isoforms - 283
Number of ns ALE-containing genes - 2937
Number of ns ALE isoforms - 6365


In [18]:
# assign cryptic status for novel bleedthroughs
novel_le_rep_bleed = novel_le_rep_bleed.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids, id_col="le_id_quant"))
novel_le_rep_bleed.cryptic_status.value_counts()


cryptic_status
NULL          1139
background     508
cryptic         67
Name: count, dtype: int64

In [34]:
# assign cryptic status for novel spliced events
novel_le_rep_spliced = novel_le_rep_spliced.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids, id_col="le_id_quant"))
novel_le_rep_spliced.as_df().drop_duplicates(subset="le_id_quant").cryptic_status.value_counts()

cryptic_status
background    404
cryptic        75
Name: count, dtype: int64

In [20]:
# Construct 'name' string with minimal annotation information
# le_id|gene_name|site_type|cryptic_status

novel_le_rep_spliced = (novel_le_rep_spliced
 .subset(lambda df: df["cryptic_status"].ne("NULL"))
 .assign("gene_name_ref",
         # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
         # some ref_gene_name entries have multiple gene names
         lambda df: df["gene_name_ref"].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
         )
         .assign("site_type",
                 lambda df: pd.Series(["spliced"]*len(df.index), index=df.index))
 .assign("Name",
                          lambda df: df["le_id_quant"].str.cat(df[["gene_name_ref", "site_type", "cryptic_status"]], sep="|"))
                    
)

novel_le_rep_bleed = (novel_le_rep_bleed
 .subset(lambda df: df["cryptic_status"].ne("NULL"))
 .assign("gene_name_ref",
         # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
         # some ref_gene_name entries have multiple gene names
         lambda df: df["gene_name_ref"].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
         )
         .assign("site_type",
                 lambda df: pd.Series(["bleedthrough"]*len(df.index), index=df.index))
 .assign("Name",
                          lambda df: df["le_id_quant"].str.cat(df[["gene_name_ref", "site_type", "cryptic_status"]], sep="|"))
                    
)

In [21]:
novel_le_rep_spliced.subset(lambda df: df.cryptic_status == "cryptic")[["le_id","Name"]]

Unnamed: 0,Chromosome,Start,End,Strand,le_id,Name
0,chr1,76871267,76871821,+,ENSG00000117069.15_2,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic
1,chr1,61824444,61825501,+,ENSG00000132849.22_1,ENSG00000132849.22_1|PATJ|spliced|cryptic
2,chr1,245464258,245471621,+,ENSG00000162849.16_2,ENSG00000162849.16_2|KIF26B|spliced|cryptic
3,chr1,112540178,112542090,-,ENSG00000007341.19_4,ENSG00000007341.19_4|ST7L|spliced|cryptic
4,chr1,243612605,243613034,-,ENSG00000117020.19_2,ENSG00000117020.19_2|AKT3|spliced|cryptic
...,...,...,...,...,...,...
107,chr21,45792478,45794699,+,ENSG00000183570.17_3,ENSG00000183570.17_3|PCBP3|spliced|cryptic
108,chr22,37464524,37465718,-,ENSG00000100060.18_2,ENSG00000100060.18_2|MFNG|spliced|cryptic
109,chrX,102721363,102724864,+,ENSG00000198908.12_1,ENSG00000198908.12_1|BHLHB9|spliced|cryptic
110,chrX,98679426,98679978,+,ENSG00000281566.3_1,ENSG00000281566.3_1|ENSG00000281566|spliced|cr...


In [22]:
# for spliced events, want to report the last exon coordinates (also split into 5'end & 3'end)
# for this purpose, current coordiantes (from input last exons) is sufficient.
# but for bleedthrough events, this will contain the complete last exon. So need to return to quant last exons to get the unique regions. 

# novel_le_rep_bleed - extract transcript_id (i.e. represents selected isoform) & downstream le_id (le_id_quant)
novel_rep_bleed_tx2le = novel_le_rep_bleed.as_df()[["transcript_id", "le_id_quant", "Name"]].rename(columns={"le_id_quant": "le_id"})
novel_rep_bleed_tx2le



Unnamed: 0,transcript_id,le_id,Name
0,PAPA.TDP-1.2120.4,ENSG00000082497.12_1,ENSG00000082497.12_1|SERTAD4|bleedthrough|back...
1,PAPA.CTRL-2.2002.3,ENSG00000082497.12_1,ENSG00000082497.12_1|SERTAD4|bleedthrough|back...
2,PAPA.ctrl_ctrl_4.2339.3,ENSG00000082497.12_1,ENSG00000082497.12_1|SERTAD4|bleedthrough|back...
3,PAPA.doxconc_DOX_0075_2.1123.3,ENSG00000116128.12_2,ENSG00000116128.12_2|BCL9|bleedthrough|cryptic
4,PAPA.doxconc_DOX_0075_1.1061.3,ENSG00000116830.12_2,ENSG00000116830.12_2|TTF2|bleedthrough|background
...,...,...,...
570,PAPA.TDP43-G_S7.20350.2,ENSG00000180182.11_3,ENSG00000180182.11_3|MED14|bleedthrough|cryptic
571,PAPA.TDP-2.25089.1,ENSG00000182518.14_1,ENSG00000182518.14_1|FAM104B|bleedthrough|back...
572,PAPA.doxconc_NT_0_3.19568.4,ENSG00000196459.15_1,ENSG00000196459.15_1|TRAPPC2|bleedthrough|back...
573,PAPA.Cont-D_S4.20482.2,ENSG00000197021.9_2,ENSG00000197021.9_2|EOLA2|bleedthrough|background


In [23]:
for k,v in rep_choices_bleed.items():
    if "ENSG00000082497.12_1" in v:
        print(k)

atlas_max_datasets


In [24]:
# subset to representative bleedthrough txs, joining in Name information
# 5'coord = 1st cooridnate of intron (i.e. where spliced occurs)
# 3'coord = final coordinate of last exon
novel_le_rep_bleed_quant = quant_uniq_le.apply(lambda df: df.merge(novel_rep_bleed_tx2le, on=["transcript_id", "le_id"], how="inner"))

novel_le_rep_bleed_quant[["Score", "Name", "transcript_id", "le_id"]].drop_duplicate_positions()



Unnamed: 0,Chromosome,Start,End,Score,Strand,Name,transcript_id,le_id
0,chr1,210241557,210245767,.,+,ENSG00000082497.12_1|SERTAD4|bleedthrough|back...,PAPA.TDP-1.2120.4,ENSG00000082497.12_1
1,chr1,210245927,210246847,.,+,ENSG00000082497.12_1|SERTAD4|bleedthrough|back...,PAPA.TDP-1.2120.4,ENSG00000082497.12_1
2,chr1,210245927,210246813,.,+,ENSG00000082497.12_1|SERTAD4|bleedthrough|back...,PAPA.CTRL-2.2002.3,ENSG00000082497.12_1
3,chr1,210245927,210246804,.,+,ENSG00000082497.12_1|SERTAD4|bleedthrough|back...,PAPA.ctrl_ctrl_4.2339.3,ENSG00000082497.12_1
4,chr1,147622531,147623356,.,+,ENSG00000116128.12_2|BCL9|bleedthrough|cryptic,PAPA.doxconc_DOX_0075_2.1123.3,ENSG00000116128.12_2
...,...,...,...,...,...,...,...,...
644,chrX,40653641,40654363,.,-,ENSG00000180182.11_3|MED14|bleedthrough|cryptic,PAPA.TDP43-G_S7.20350.2,ENSG00000180182.11_3
645,chrX,55142867,55146180,.,-,ENSG00000182518.14_1|FAM104B|bleedthrough|back...,PAPA.TDP-2.25089.1,ENSG00000182518.14_1
646,chrX,149931286,149932588,.,-,ENSG00000197021.9_2|EOLA2|bleedthrough|background,PAPA.Cont-D_S4.20482.2,ENSG00000197021.9_2
647,chrX,131823777,131825221,.,-,ENSG00000213468.7_1|FIRRE|bleedthrough|cryptic,PAPA.TDP43_ctrl_2.26896.10,ENSG00000213468.7_1


### Repeat for annotated

In [25]:
# get le_ids in which all transcripts are annotated
annotated_le_ids = dexseq_df.groupby("le_id").filter(lambda df: (df.annot_status == "annotated").all()).le_id
le_id_bleed_annot = set(le_id_bleed).intersection(set(annotated_le_ids))
le_id_spliced_annot = set(le_id_spliced).intersection(set(annotated_le_ids))
print(f"Number of annotated bleedthrough last exons (assessed by DEXSeq) - {len(le_id_bleed_annot)}")
print(f"Number of annotated spliced last exons (assessed by DEXSeq) - {len(le_id_spliced_annot)}")

Number of annotated bleedthrough last exons (assessed by DEXSeq) - 2288
Number of annotated spliced last exons (assessed by DEXSeq) - 3561


In [26]:
# subset quantification GTF for spliecd last exons 
ref_le_spliced = quant_uniq_le.subset(lambda df: df.le_id.isin(le_id_spliced_annot))
# select representative isoform for each last exon ID (most distal 3'end)
ref_le_spliced = select_rep_prox_site(ref_le_spliced, "le_id")

# subset quantification GTF for bleedthrough last exons 
ref_le_bleed = quant_uniq_le.subset(lambda df: df.le_id.isin(le_id_bleed_annot))
# select representative isoform for each last exon ID (most distal 3'end)
ref_le_bleed = select_rep_prox_site(ref_le_bleed, "le_id")

In [27]:
# assign cryptic status for annotatesd spliced last exons
ref_le_spliced = ref_le_spliced.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids, id_col="le_id"))
ref_le_spliced.cryptic_status.value_counts()


cryptic_status
NULL          2275
background    2176
cryptic         39
Name: count, dtype: int64

In [28]:
# assign cryptic status for annotatesd bleedthrough last exons
ref_le_bleed = ref_le_bleed.assign("cryptic_status", lambda df: _define_cryptic_status(df, cryp_le_ids, ns_le_ids, id_col="le_id"))
ref_le_bleed.cryptic_status.value_counts()

cryptic_status
background    1382
NULL          1261
cryptic         24
Name: count, dtype: int64

In [29]:
ref_le_spliced[["ref_gene_name"]]

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name
0,chr1,109214546,109215153,+,SARS1
1,chr1,69873466,69875004,+,LRRC7
2,chr1,6634019,6635586,+,THAP3
3,chr1,7779625,7781432,+,VAMP3
4,chr1,243305011,243306548,+,SDCCAG8
...,...,...,...,...,...
4485,chrY,12859278,12860839,+,USP9Y
4486,chrY,18932316,18932841,-,TTTY14
4487,chrY,18872500,18872834,-,TTTY14
4488,chrY,18872500,18872834,-,TTTY14


In [30]:
# assign Name field with minimal info
ref_le_spliced = (ref_le_spliced
  .subset(lambda df: df["cryptic_status"].ne("NULL"))
 .assign("gene_name_ref",
         # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
         # some ref_gene_name entries have multiple gene names
         lambda df: df["ref_gene_name"].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
         )
         .assign("site_type",
                 lambda df: pd.Series(["spliced"]*len(df.index), index=df.index))
 .assign("Name",
                          lambda df: df["le_id"].str.cat(df[["gene_name_ref", "site_type", "cryptic_status"]], sep="|"))
)

ref_le_bleed = (ref_le_bleed
 .subset(lambda df: df["cryptic_status"].ne("NULL"))
 .assign("gene_name_ref",
         # list(dict.fromkeys(x.split(","))) - drops duplicates whilst preserving order
         # some ref_gene_name entries have multiple gene names
         lambda df: df["ref_gene_name"].apply(lambda x: ",".join(list(dict.fromkeys(x.split(",")))))
         )
.assign("site_type",
                 lambda df: pd.Series(["bleedthrough"]*len(df.index), index=df.index))
 .assign("Name",
                          lambda df: df["le_id"].str.cat(df[["gene_name_ref", "site_type", "cryptic_status"]], sep="|"))
)

In [31]:
# since last_exon_spliced annotations includes most distal last exon of a gene, let's compare these ALEs to non-terminal ALEs (i.e. there is a competing splicign event)
print(ref_le_spliced.event_type.value_counts())

ref_le_spliced = ref_le_spliced.subset(lambda df: df["event_type"].ne("last_exon_spliced"))
ref_le_spliced
      

event_type
last_exon_spliced                           1190
first_exon_spliced                           525
internal_exon_spliced                        247
internal_exon_extension                      106
first_exon_spliced,internal_exon_spliced     106
first_exon_extension                          41
Name: count, dtype: int64


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id,cryptic_status,gene_name_ref,site_type,Name
0,chr1,.,exon,39753286,39756769,.,+,.,ENSG00000084072.17,PPIE,...,,internal_exon_spliced,ENSG00000084072.17,PPIE,2.0,ENSG00000084072.17_2,background,PPIE,spliced,ENSG00000084072.17_2|PPIE|spliced|background
1,chr1,.,exon,39763688,39763914,.,+,.,ENSG00000084072.17,PPIE,...,,internal_exon_spliced,ENSG00000084072.17,PPIE,3.0,ENSG00000084072.17_3,background,PPIE,spliced,ENSG00000084072.17_3|PPIE|spliced|background
2,chr1,.,exon,170736047,170739421,.,+,.,ENSG00000116132.12,PRRX1,...,"NULL,NULL",internal_exon_spliced,ENSG00000116132.12,PRRX1,2.0,ENSG00000116132.12_2,background,PRRX1,spliced,ENSG00000116132.12_2|PRRX1|spliced|background
3,chr1,.,exon,110232597,110249124,.,+,.,ENSG00000116396.15,KCNC4,...,16527.0,internal_exon_extension,ENSG00000116396.15,KCNC4,2.0,ENSG00000116396.15_2,background,KCNC4,spliced,ENSG00000116396.15_2|KCNC4|spliced|background
4,chr1,.,exon,32855923,32858875,.,+,.,ENSG00000116497.18,S100PBP,...,,first_exon_spliced,ENSG00000116497.18,S100PBP,3.0,ENSG00000116497.18_3,background,S100PBP,spliced,ENSG00000116497.18_3|S100PBP|spliced|background
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,chrX,.,exon,72274254,72274435,.,-,.,ENSG00000198034.11,RPS4X,...,,internal_exon_spliced,ENSG00000198034.11,RPS4X,1.0,ENSG00000198034.11_1,background,RPS4X,spliced,ENSG00000198034.11_1|RPS4X|spliced|background
1021,chrX,.,exon,130981589,130981856,.,-,.,ENSG00000228659.1,LINC01201,...,,first_exon_spliced,ENSG00000228659.1,LINC01201,3.0,ENSG00000228659.1_3,background,LINC01201,spliced,ENSG00000228659.1_3|LINC01201|spliced|background
1022,chrX,.,exon,149527590,149528005,.,-,.,"ENSG00000241489.8,ENSG00000241769.7",EOLA1-DT,...,"NULL,NULL,NULL",internal_exon_spliced,"ENSG00000241489.8,ENSG00000241769.7",EOLA1-DT,1.0,"ENSG00000241489.8,ENSG00000241769.7_1",background,EOLA1-DT,spliced,"ENSG00000241489.8,ENSG00000241769.7_1|EOLA1-DT..."
1023,chrY,.,exon,18872500,18872834,.,-,.,ENSG00000176728.10,TTTY14,...,,first_exon_spliced,ENSG00000176728.10,TTTY14,3.0,ENSG00000176728.10_3,background,TTTY14,spliced,ENSG00000176728.10_3|TTTY14|spliced|background


In [32]:
# combined output BEDs
spliced_out_bed = pr.concat([novel_le_rep_spliced[["Score", "Name"]].drop_duplicate_positions(),
                             ref_le_spliced[["Score", "Name"]].drop_duplicate_positions()])

bleed_out_bed = pr.concat([novel_le_rep_bleed_quant[["Score", "Name"]].drop_duplicate_positions(),
                             ref_le_bleed[["Score", "Name"]].drop_duplicate_positions()])

spliced_out_bed.subset(lambda df: df.Name.str.contains("cryptic$"))

# novel_le_rep_bleed_quant[["Score", "Name"]].drop_duplicate_positions()
# novel_le_rep_spliced[["Score", "Name"]].drop_duplicate_positions()


# generate output bed file of proximal and distal PAS
# utr3_out_bed = pr.concat([sel_le_d3utr_prox[["Score", "Name"]],
#                           sel_novel_le_d3utr[["Score", "Name"]]]
#                           )

# print(len(utr3_out_bed))
# # drop duplicate entries
# utr3_out_bed = utr3_out_bed.apply(lambda df: df.drop_duplicates()).sort()

# utr3_out_bed

Unnamed: 0,Chromosome,Start,End,Score,Strand,Name
0,chr1,76871267,76871821,.,+,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic
1,chr1,61824444,61825501,.,+,ENSG00000132849.22_1|PATJ|spliced|cryptic
2,chr1,245464258,245471621,.,+,ENSG00000162849.16_2|KIF26B|spliced|cryptic
3,chr1,1616614,1619210,.,+,ENSG00000197530.13_1|MIB2|spliced|cryptic
4,chr1,112540178,112542090,.,-,ENSG00000007341.19_4|ST7L|spliced|cryptic
...,...,...,...,...,...,...
128,chr22,37464524,37465718,.,-,ENSG00000100060.18_2|MFNG|spliced|cryptic
129,chrX,102721363,102724864,.,+,ENSG00000198908.12_1|BHLHB9|spliced|cryptic
130,chrX,98679426,98679978,.,+,ENSG00000281566.3_1|ENSG00000281566|spliced|cr...
131,chrX,17835910,17837395,.,-,ENSG00000131831.18_1|RAI2|spliced|cryptic


In [49]:
spliced_out_bed.subset(lambda df: df.Name.str.contains("STMN2", regex=False))

Unnamed: 0,Chromosome,Start,End,Score,Strand,Name
0,chr8,79616821,79617073,.,+,ENSG00000104435.14_1|STMN2|spliced|cryptic
1,chr8,79616821,79617049,.,+,ENSG00000104435.14_1|STMN2|spliced|cryptic
2,chr8,79616821,79617071,.,+,ENSG00000104435.14_1|STMN2|spliced|cryptic


In [53]:
Counter([k if le_id.split("|")[0] in v else None for le_id in spliced_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index for k,v in rep_choices_spliced.items()])

Counter({None: 526, 'atlas_1_pred': 62, 'atlas_max_datasets': 32})

In [33]:
bleed_out_bed.subset(lambda df: df.Name.str.contains("cryptic$"))

Unnamed: 0,Chromosome,Start,End,Score,Strand,Name
0,chr1,147622531,147623356,.,+,ENSG00000116128.12_2|BCL9|bleedthrough|cryptic
1,chr1,21453372,21457150,.,+,ENSG00000142794.19_3|NBPF3|bleedthrough|cryptic
2,chr1,156139102,156140091,.,+,ENSG00000160789.24_11|LMNA|bleedthrough|cryptic
3,chr1,149834134,149835093,.,+,ENSG00000270882.3_2|H4C14|bleedthrough|cryptic
4,chr1,45013791,45013889,.,+,ENSG00000126088.14_1|UROD|bleedthrough|cryptic
...,...,...,...,...,...,...
87,chr21,42849225,42850040,.,-,ENSG00000160193.12_1|WDR4|bleedthrough|cryptic
88,chrX,91882906,91891321,.,+,ENSG00000102290.23_3|PCDH11X|bleedthrough|cryptic
89,chrX,107087378,107088435,.,-,ENSG00000089682.17_2|RBM41|bleedthrough|cryptic
90,chrX,40653641,40654363,.,-,ENSG00000180182.11_3|MED14|bleedthrough|cryptic


In [54]:
Counter([k if le_id.split("|")[0] in v else None for le_id in bleed_out_bed.drop_duplicate_positions().as_df()["Name"].value_counts().loc[lambda x: x > 1].index for k,v in rep_choices_spliced.items()])

Counter({None: 728,
         'atlas_1_pred': 23,
         'motif_shortest_min': 13,
         'atlas_max_datasets': 9,
         'motif_1_min': 2})

In [None]:
if not os.path.exists("../processed/iclip_regions"):
    os.makedirs("../processed/iclip_regions")

#### TEMPORARY - work out why duplicate regions/multiple PAS coords for some last exons. To prevent duplciats slect longest per interval
spliced_out_bed = select_rep_prox_site(spliced_out_bed, "Name")
bleed_out_bed = select_rep_prox_site(bleed_out_bed, "Name")

spliced_out_bed.to_bed("../processed/iclip_regions/2023-06-26_papa_cryptic_spliced.last_exons.bed")
# also make one with just polYA sites
spliced_out_bed.three_end().to_bed("../processed/iclip_regions/2023-06-26_papa_cryptic_spliced.pas.bed")
# also first coordinate of exon (i.e. interval ending at this coordinate = splice site end, or substract 1 (strand-aware) to get final nucleotide of splice site)
spliced_out_bed.five_end().to_bed("../processed/iclip_regions/2023-06-26_papa_cryptic_spliced.le_start.bed")

bleed_out_bed.to_bed("../processed/iclip_regions/2023-06-26_papa_cryptic_bleedthrough.last_exons.bed")
# also make one with just polYA sites
bleed_out_bed.three_end().to_bed("../processed/iclip_regions/2023-06-26_papa_cryptic_bleedthrough.pas.bed")
# also first coordinate of exon (i.e. 1st nucleotide of intron
bleed_out_bed.five_end().to_bed("../processed/iclip_regions/2023-06-26_papa_cryptic_bleedthrough.le_start.bed")