In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
from helpers import get_terminal_regions, _df_update_3p, _df_update_5p, add_region_number
import os

In [2]:
# Prioritised Zeng et al. target genes with PAPA annotations
zeng_targets_papa = "ARHGAP32 CNPY3 RFNG SIX3 STMN2 TLX1 SLIT3".split(" ")
zeng_targets_papa


['ARHGAP32', 'CNPY3', 'RFNG', 'SIX3', 'STMN2', 'TLX1', 'EGFR', 'SLIT3']

In [3]:
outdir = "processed/isoform_specific/regions/"
zeng_bed = pr.PyRanges(pd.read_csv(os.path.join(outdir, "zeng_pas_fig2c.updated.bed.tsv"), sep="\t"))
zeng_bed

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name_common,pas_id
0,chr1,629997,629998,MTND2P28,.,+,MTND2P28,chr1:629997:629998:+:MTND2P28
1,chr1,630367,630368,MTND2P28,.,+,MTND2P28,chr1:630367:630368:+:MTND2P28
2,chr1,854387,854388,LINC01128,.,+,LINC01128,chr1:854387:854388:+:LINC01128
3,chr1,859444,859445,LINC01128,.,+,LINC01128,chr1:859444:859445:+:LINC01128
4,chr1,1011462,1011463,ISG15,.,+,ISG15,chr1:1011462:1011463:+:ISG15
...,...,...,...,...,...,...,...,...
16096,chrY,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14
16097,chrY,19691944,19691945,,.,-,ENSG00000260197,chrY:19691944:19691945:-:ENSG00000260197
16098,chrY,19692490,19692491,,.,-,ENSG00000260197,chrY:19692490:19692491:-:ENSG00000260197
16099,chrY,19703866,19703867,KDM5D,.,-,KDM5D,chrY:19703866:19703867:-:KDM5D


In [4]:
# load in BED file of PAPA cryptics
papa_cryp_bed = pr.read_bed("data/2023-12-15_all.last_exons.cryptic.bed")

# get le_ids for cryptics (+ their annotated partners)
# e.g. ENSG00000213468.7_1|FIRRE|bleedthrough|cryptic	
papa_name_spl = papa_cryp_bed.Name.str.split("\\|", expand=True)
cryp_le_ids = set(papa_name_spl[0])
cryp_gn = set(papa_name_spl[1])
print(f"Number of PAPA cryptics - {len(cryp_le_ids)}")
print(cryp_le_ids)
papa_cryp_bed

Number of PAPA cryptics - 284
{'ENSG00000109472.14_2', 'ENSG00000105808.19_5', 'ENSG00000100299.18_2', 'ENSG00000228824.8_4', 'ENSG00000135090.14_5', 'ENSG00000145819.18_3', 'ENSG00000139767.10_2', 'ENSG00000081760.17_1', 'ENSG00000186615.12_2', 'ENSG00000108587.16_4', 'ENSG00000131375.10_3', 'ENSG00000231312.8_4', 'ENSG00000105808.19_4', 'ENSG00000132849.22_1', 'ENSG00000078269.15_1', 'ENSG00000066032.19_6', 'ENSG00000105926.16_2', 'ENSG00000065526.12_4', 'ENSG00000155897.10_1', 'ENSG00000254602.2_1', 'ENSG00000121904.19_3', 'ENSG00000152954.12_1', 'ENSG00000075035.10_4', 'ENSG00000151692.15_7', 'ENSG00000117069.15_2', 'ENSG00000066379.15_1', 'ENSG00000100557.10_3', 'ENSG00000101624.11_3', 'ENSG00000119912.18_4', 'ENSG00000035141.8_3', 'ENSG00000253741.3_1', 'ENSG00000244115.1_2', 'ENSG00000122783.17_3', 'ENSG00000204314.12_1', 'ENSG00000203546.7,ENSG00000129480.13_2', 'ENSG00000148814.18_3', 'ENSG00000127995.17_2', 'ENSG00000170579.17_1', 'ENSG00000066032.19_7', 'ENSG00000153094.24_2

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,15939295,15940456,ENSG00000065526.12_3|SPEN|proximal|cryptic,.,+
1,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+
2,chr1,177280411,177282422,ENSG00000198797.7_1|BRINP2|proximal|cryptic,.,+
3,chr1,177280411,177284690,ENSG00000198797.7_2|BRINP2|distal|cryptic,.,+
4,chr1,76871267,76871821,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic,.,+
...,...,...,...,...,...,...
279,chrX,123600573,123601338,ENSG00000125676.20_3|THOC2|distal|cryptic,.,-
280,chrX,17835910,17837395,ENSG00000131831.18_1|RAI2|spliced|cryptic,.,-
281,chrX,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-
282,chrX,40653641,40654363,ENSG00000180182.11_3|MED14|bleedthrough|cryptic,.,-


In [5]:
# get Zeng IDs that are missing from iCLIP GTFs (+ add TLX1 which is misannotated)
zeng_targets_papa_missing = set(zeng_targets_papa).difference(cryp_gn).union({"TLX1"})
zeng_targets_papa_missing

{'EGFR', 'RFNG', 'TLX1'}

In [6]:
papa_gtf = pr.read_gtf("data/novel_ref_combined.last_exons.gtf")
papa_gtf

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,24419290,24420128,.,+,.,ENSG00000001461.17,NIPAL3,...,last,24419290,24419640,ENST00000003912.7,488.0,internal_exon_extension,ENSG00000001461.17,NIPAL3,1.0,ENSG00000001461.17_1
1,chr1,.,exon,24421813,24422110,.,+,.,ENSG00000001461.17,NIPAL3,...,last,244196402441964024419640,244401712444017124433103,"ENST00000358028.8,ENST00000374399.9,ENST000000...","NULL,NULL,NULL",internal_exon_spliced,ENSG00000001461.17,NIPAL3,2.0,ENSG00000001461.17_2
2,chr1,.,exon,24454053,24454824,.,+,.,ENSG00000001461.17,NIPAL3,...,last,2445350424453504,2445613724456137,"ENST00000003912.7,ENST00000374399.9","NULL,NULL",internal_exon_spliced,ENSG00000001461.17,NIPAL3,3.0,ENSG00000001461.17_3
3,chr1,.,exon,24464025,24466378,.,+,.,ENSG00000001461.17,NIPAL3,...,last,24464025,24464120,ENST00000003912.7,2258.0,internal_exon_extension,ENSG00000001461.17,NIPAL3,4.0,ENSG00000001461.17_4
4,chr1,.,exon,24468985,24472976,.,+,.,ENSG00000001461.17,NIPAL3,...,last,2446412024464120,2446898524468985,"ENST00000003912.7,ENST00000374399.9","NULL,NULL",last_exon_spliced,ENSG00000001461.17,NIPAL3,5.0,ENSG00000001461.17_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124061,chrY,.,exon,21880075,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
124062,chrY,.,exon,21880075,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
124063,chrY,.,exon,21880307,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
124064,chrY,.,exon,21038288,21039044,.,-,.,ENSG00000254488.1,ENSG00000254488,...,last,21039044,21042268,ENST00000527562.1,,last_exon_spliced,ENSG00000254488.1,ENSG00000254488,1.0,ENSG00000254488.1_1


In [7]:
papa_gtf.columns

Index(['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_name', 'transcript_id', 'exon_number',
       'region_rank', 'Start_ref', 'End_ref', 'transcript_id_ref',
       '3p_extension_length', 'event_type', 'ref_gene_id', 'ref_gene_name',
       'le_number', 'le_id'],
      dtype='object')

## Clean up TLX1 and RFNG

xx

In [12]:
# subset to novel predictions (never need annotated, so just overwrite existing object)
papa_gtf_annot = papa_gtf.subset(lambda df: ~df.transcript_id.str.startswith("PAPA"))
papa_gtf_novel = papa_gtf.subset(lambda df: df.transcript_id.str.startswith("PAPA"))

In [13]:
# subset to Zeng et al. target genes (+ TLX1)
papa_gtf_novel_sel = papa_gtf_novel.subset(lambda df: df.ref_gene_name.isin(zeng_targets_papa_missing))
papa_gtf_novel_sel


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr10,.,exon,101134174,101138530,.,+,.,PAPA.TDP-1.13599,,...,,,,,,first_exon_spliced,ENSG00000107807.13,TLX1,1.0,ENSG00000107807.13_1
1,chr10,.,exon,101134174,101138530,.,+,.,PAPA.TDP43-F_S6.10958,,...,,,,,,first_exon_spliced,ENSG00000107807.13,TLX1,1.0,ENSG00000107807.13_1
2,chr10,.,exon,101136690,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2
3,chr10,.,exon,101136690,101138522,.,+,.,PAPA.TDP43_19065409_S29.14398,,...,,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2
4,chr10,.,exon,101136690,101138528,.,+,.,PAPA.TDP43_ctrl_2.14536,,...,,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2
5,chr17,.,exon,82045104,82048807,.,-,.,PAPA.TDP43_19065409_S29.22294,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
6,chr17,.,exon,82045147,82048807,.,-,.,PAPA.NT_19074723_S36.21380,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
7,chr17,.,exon,82045150,82048807,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
8,chr17,.,exon,82045423,82048807,.,-,.,PAPA.TDP-4.21887,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


### TLX1

In [14]:
papa_cryp_bed.subset(lambda df: df.Name.str.contains("TLX1"))

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr10,101134376,101138530,ENSG00000107807.13_1|TLX1|proximal|cryptic,.,+
1,chr10,101136690,101138531,ENSG00000107807.13_2|TLX1|distal|cryptic,.,+


In [15]:
papa_gtf_tlx1_prox = papa_gtf_annot.subset(lambda df: df.Start == 101136690)
# distal PAS can be plucked straight as selected in BED file
papa_gtf_tlx1_dist = papa_gtf_novel.subset(lambda df: df.End == 101138531) 

# now make proximal by overlapping with Zeng PAS and truncating
tlx1 = pr.concat([papa_gtf_tlx1_prox, papa_gtf_tlx1_dist])
tlx1

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr10,.,exon,101136690,101137789,.,+,.,ENSG00000107807.13,TLX1,...,last,1.0113437610113437e+17,1.0113669010113666e+17,"ENST00000370196.11,ENST00000467928.2","NULL,NULL",last_exon_spliced,ENSG00000107807.13,TLX1,1.0,ENSG00000107807.13_1
1,chr10,.,exon,101136690,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2


### RFNG

Problem here was a slight bug with PAPA grouping extensions together with the annot event

In [34]:
rfng_annot = papa_gtf_annot.subset(lambda df: df.ref_gene_name == "RFNG")
rfng_novel = papa_gtf_novel.subset(lambda df: df.ref_gene_name == "RFNG")
rfng_annot

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
1,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
2,chr17,.,exon,82047901,82049116,.,-,.,ENSG00000169733.12,RFNG,...,last,82049030,82049116,ENST00000310496.9,1129.0,internal_exon_extension,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
3,chr17,.,exon,82048444,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


In [35]:
# pick longest annotated RFNG PAS
rfng_prox = rfng_annot.subset(lambda df: (df.Start == 82047901 ) & (df.End == 82048807)).drop_duplicate_positions()
rfng_prox

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


In [36]:
# manual curation suggests x is one to pick (s)
rfng_dist = rfng_novel.subset(lambda df: df.Start == 82045150)
rfng_dist


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
52583,chr17,.,exon,82045150,82048807,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


In [38]:
rfng = pr.concat([rfng_prox, rfng_dist]).sort()
rfng

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr17,.,exon,82045150,82048807,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
1,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


In [39]:
rfng_tlx1 = pr.concat([rfng, tlx1]).sort()
rfng_tlx1

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr10,.,exon,101136690,101137789,.,+,.,ENSG00000107807.13,TLX1,...,last,101134376101134376,101136690101136667,"ENST00000370196.11,ENST00000467928.2","NULL,NULL",last_exon_spliced,ENSG00000107807.13,TLX1,1.0,ENSG00000107807.13_1
1,chr10,.,exon,101136690,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2
2,chr17,.,exon,82045150,82048807,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
3,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


## Generating bins for all ALE events

In [44]:
# remove tlx1 from analysis 
papa_cryp_bed_upd = papa_cryp_bed.subset(lambda df: ~df.Name.str.contains("TLX1"))
# pull out gene name
papa_cryp_bed_upd = papa_cryp_bed_upd.assign("gene_name", lambda df: df.Name.str.split("\\|", expand=True)[1])
papa_cryp_bed_upd

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name
0,chr1,15939295,15940456,ENSG00000065526.12_3|SPEN|proximal|cryptic,.,+,SPEN
1,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN
2,chr1,177280411,177282422,ENSG00000198797.7_1|BRINP2|proximal|cryptic,.,+,BRINP2
3,chr1,177280411,177284690,ENSG00000198797.7_2|BRINP2|distal|cryptic,.,+,BRINP2
4,chr1,76871267,76871821,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic,.,+,ST6GALNAC5
...,...,...,...,...,...,...,...
277,chrX,123600573,123601338,ENSG00000125676.20_3|THOC2|distal|cryptic,.,-,THOC2
278,chrX,17835910,17837395,ENSG00000131831.18_1|RAI2|spliced|cryptic,.,-,RAI2
279,chrX,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,COL4A6
280,chrX,40653641,40654363,ENSG00000180182.11_3|MED14|bleedthrough|cryptic,.,-,MED14


In [48]:
# annotate prox and distal 
papa_cryp_bed_upd = add_region_number(papa_cryp_bed_upd.assign("Feature", lambda df: pd.Series(["exon"]*len(df), index=df.index)),
                  id_col="gene_name",
                  feature_key="exon",
                  out_col="pas_number",
                  sort_col="End",
                  ).sort()

papa_cryp_bed_upd

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name,Feature,pas_number
0,chr1,15939295,15940456,ENSG00000065526.12_3|SPEN|proximal|cryptic,.,+,SPEN,exon,1.0
1,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,exon,2.0
2,chr1,21453372,21457150,ENSG00000142794.19_3|NBPF3|bleedthrough|cryptic,.,+,NBPF3,exon,1.0
3,chr1,54634687,54639192,ENSG00000162390.18_4|ACOT11|spliced|cryptic,.,+,ACOT11,exon,1.0
4,chr1,61824444,61825501,ENSG00000132849.22_1|PATJ|spliced|cryptic,.,+,PATJ,exon,1.0
...,...,...,...,...,...,...,...,...,...
277,chrX,47635520,47636927,ENSG00000126767.18_1|ELK1|proximal|cryptic,.,-,ELK1,exon,1.0
278,chrX,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,COL4A6,exon,1.0
279,chrX,123600550,123603748,ENSG00000125676.20_2|THOC2|proximal|cryptic,.,-,THOC2,exon,1.0
280,chrX,123600573,123601338,ENSG00000125676.20_3|THOC2|distal|cryptic,.,-,THOC2,exon,2.0


In [49]:
papa_cryp_bed_upd.subset(lambda df: df.gene_name == "ELK1")

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name,Feature,pas_number
0,chrX,47631618,47636927,ENSG00000126767.18_2|ELK1|distal|cryptic,.,-,ELK1,exon,1.0
1,chrX,47635520,47636927,ENSG00000126767.18_1|ELK1|proximal|cryptic,.,-,ELK1,exon,1.0
