In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
from helpers import get_terminal_regions, _df_update_3p, _df_update_5p, add_region_number
import os

In [2]:
# Prioritised Zeng et al. target genes with PAPA annotations (explicit list of gene symbols)
zeng_targets_papa = [
    "ARHGAP32",
    "CNPY3",
    "RFNG",
    "SIX3",
    "STMN2",
    "TLX1",
    "SLIT3",
]
zeng_targets_papa


['ARHGAP32', 'CNPY3', 'RFNG', 'SIX3', 'STMN2', 'TLX1', 'SLIT3']

In [3]:
# Read the tab-separated Zeng et al. PAS table and wrap it as a PyRanges object
outdir = "processed/isoform_specific/regions/"
zeng_bed_path = os.path.join(outdir, "zeng_pas_fig2c.updated.bed.tsv")
zeng_bed_df = pd.read_csv(zeng_bed_path, sep="\t")
zeng_bed = pr.PyRanges(zeng_bed_df)
zeng_bed

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name_common,pas_id
0,chr1,629997,629998,MTND2P28,.,+,MTND2P28,chr1:629997:629998:+:MTND2P28
1,chr1,630367,630368,MTND2P28,.,+,MTND2P28,chr1:630367:630368:+:MTND2P28
2,chr1,854387,854388,LINC01128,.,+,LINC01128,chr1:854387:854388:+:LINC01128
3,chr1,859444,859445,LINC01128,.,+,LINC01128,chr1:859444:859445:+:LINC01128
4,chr1,1011462,1011463,ISG15,.,+,ISG15,chr1:1011462:1011463:+:ISG15
...,...,...,...,...,...,...,...,...
16096,chrY,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14
16097,chrY,19691944,19691945,,.,-,ENSG00000260197,chrY:19691944:19691945:-:ENSG00000260197
16098,chrY,19692490,19692491,,.,-,ENSG00000260197,chrY:19692490:19692491:-:ENSG00000260197
16099,chrY,19703866,19703867,KDM5D,.,-,KDM5D,chrY:19703866:19703867:-:KDM5D


In [4]:
# Load the BED file of PAPA cryptic last exons
papa_cryp_bed = pr.read_bed("data/2023-12-15_all.last_exons.cryptic.bed")

# Name is pipe-delimited, e.g. ENSG00000213468.7_1|FIRRE|bleedthrough|cryptic
# field 0 -> le_id, field 1 -> gene name
papa_name_spl = papa_cryp_bed.Name.str.split("\\|", expand=True)
cryp_le_ids, cryp_gn = set(papa_name_spl[0]), set(papa_name_spl[1])

# the count reported is the number of unique le_ids, not the number of BED rows
print(f"Number of PAPA cryptics - {len(cryp_le_ids)}")
print(cryp_le_ids)
papa_cryp_bed

Number of PAPA cryptics - 284
{'ENSG00000139767.10_3', 'ENSG00000128487.19_1', 'ENSG00000162849.16_2', 'ENSG00000131375.10_2', 'ENSG00000164038.16_1', 'ENSG00000171094.18_1', 'ENSG00000138802.11_2', 'ENSG00000102678.7_2', 'ENSG00000101412.13_2', 'ENSG00000065457.11_1', 'ENSG00000126767.18_2', 'ENSG00000135315.12_2', 'ENSG00000213020.10_1', 'ENSG00000128708.13_3', 'ENSG00000197497.11_3', 'ENSG00000132394.11_3', 'ENSG00000253741.3_2', 'ENSG00000091009.8_1', 'ENSG00000247199.6_1', 'ENSG00000169856.9_3', 'ENSG00000188191.15_3', 'ENSG00000166135.14_3', 'ENSG00000100299.18_1', 'ENSG00000109534.17_4', 'ENSG00000265808.4_3', 'ENSG00000117151.13_1', 'ENSG00000198797.7_2', 'ENSG00000075035.10_3', 'ENSG00000135315.12_1', 'ENSG00000118922.18_1', 'ENSG00000204314.12_2', 'ENSG00000100557.10_4', 'ENSG00000166135.14_2', 'ENSG00000186908.15_2', 'ENSG00000163964.18_2', 'ENSG00000204764.14_9', 'ENSG00000026652.15_3', 'ENSG00000186615.12_2', 'ENSG00000169856.9_1', 'ENSG00000197563.11_3', 'ENSG00000075035.

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,15939295,15940456,ENSG00000065526.12_3|SPEN|proximal|cryptic,.,+
1,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+
2,chr1,177280411,177282422,ENSG00000198797.7_1|BRINP2|proximal|cryptic,.,+
3,chr1,177280411,177284690,ENSG00000198797.7_2|BRINP2|distal|cryptic,.,+
4,chr1,76871267,76871821,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic,.,+
...,...,...,...,...,...,...
279,chrX,123600573,123601338,ENSG00000125676.20_3|THOC2|distal|cryptic,.,-
280,chrX,17835910,17837395,ENSG00000131831.18_1|RAI2|spliced|cryptic,.,-
281,chrX,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-
282,chrX,40653641,40654363,ENSG00000180182.11_3|MED14|bleedthrough|cryptic,.,-


In [5]:
# Zeng targets whose gene name is absent from the PAPA cryptic BED gene names (cryp_gn),
# plus TLX1, which is misannotated and therefore re-curated alongside them
zeng_targets_papa_missing = (set(zeng_targets_papa) - cryp_gn) | {"TLX1"}
zeng_targets_papa_missing

{'RFNG', 'TLX1'}

In [6]:
# Load the PAPA last-exon GTF (file name suggests novel + reference last exons combined).
# Each row carries the le_id / le_number / ref_gene_* attributes used throughout the notebook.
papa_gtf = pr.read_gtf("data/novel_ref_combined.last_exons.gtf")
papa_gtf

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,24419290,24420128,.,+,.,ENSG00000001461.17,NIPAL3,...,last,24419290,24419640,ENST00000003912.7,488.0,internal_exon_extension,ENSG00000001461.17,NIPAL3,1.0,ENSG00000001461.17_1
1,chr1,.,exon,24421813,24422110,.,+,.,ENSG00000001461.17,NIPAL3,...,last,244196402441964024419640,244401712444017124433103,"ENST00000358028.8,ENST00000374399.9,ENST000000...","NULL,NULL,NULL",internal_exon_spliced,ENSG00000001461.17,NIPAL3,2.0,ENSG00000001461.17_2
2,chr1,.,exon,24454053,24454824,.,+,.,ENSG00000001461.17,NIPAL3,...,last,2445350424453504,2445613724456137,"ENST00000003912.7,ENST00000374399.9","NULL,NULL",internal_exon_spliced,ENSG00000001461.17,NIPAL3,3.0,ENSG00000001461.17_3
3,chr1,.,exon,24464025,24466378,.,+,.,ENSG00000001461.17,NIPAL3,...,last,24464025,24464120,ENST00000003912.7,2258.0,internal_exon_extension,ENSG00000001461.17,NIPAL3,4.0,ENSG00000001461.17_4
4,chr1,.,exon,24468985,24472976,.,+,.,ENSG00000001461.17,NIPAL3,...,last,2446412024464120,2446898524468985,"ENST00000003912.7,ENST00000374399.9","NULL,NULL",last_exon_spliced,ENSG00000001461.17,NIPAL3,5.0,ENSG00000001461.17_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124061,chrY,.,exon,21880075,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
124062,chrY,.,exon,21880075,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
124063,chrY,.,exon,21880307,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
124064,chrY,.,exon,21038288,21039044,.,-,.,ENSG00000254488.1,ENSG00000254488,...,last,21039044,21042268,ENST00000527562.1,,last_exon_spliced,ENSG00000254488.1,ENSG00000254488,1.0,ENSG00000254488.1_1


In [7]:
# list every column / GTF attribute available (the table display above truncates the middle columns)
papa_gtf.columns

Index(['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_name', 'transcript_id', 'exon_number',
       'region_rank', 'Start_ref', 'End_ref', 'transcript_id_ref',
       '3p_extension_length', 'event_type', 'ref_gene_id', 'ref_gene_name',
       'le_number', 'le_id'],
      dtype='object')

## Extract partner, annotated/alt last exon for IPA and ALEs

The BED file only has the alternative ALE for a subset of cryptic ALE/IPAs. Extract them for the quantification GTF using the following criteria:
- Cryptic ALE/IPA genes
- Extract most distal non-cryptic le_id
- Merge exons assigned to le_id, keep minimal features for gencode

In [113]:
# extract ids

# cryptic ALE/IPA events = BED rows whose Name contains 'spliced' or 'bleedthrough'
# e.g. ENSG00000162849.16_2|KIF26B|spliced|cryptic
papa_cryp_bed_ale_ipa = papa_cryp_bed.subset(
    lambda df: df.Name.str.contains("spliced|bleedthrough")
)
name_split_ale_ipa = papa_cryp_bed_ale_ipa.Name.str.split("\\|", expand=True)
cryp_le_ids_ale_ipa, cryp_gn_ale_ipa = (
    set(name_split_ale_ipa[0]),
    set(name_split_ale_ipa[1]),
)

# every GTF last exon belonging to one of those genes ...
papa_gtf_cryp_genes = papa_gtf.subset(
    lambda df: df.ref_gene_name.isin(cryp_gn_ale_ipa)
)

# ... with the cryptic ALE/IPA le_ids themselves removed
papa_gtf_cryp_genes_noncryp = papa_gtf_cryp_genes.subset(
    lambda df: ~df.le_id.isin(cryp_le_ids_ale_ipa)
)
papa_gtf_cryp_genes_noncryp


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,21446036,21446482,.,+,.,ENSG00000142794.19,NBPF3,...,last,2144521921445219214452192144521921445219,2147063121468687214686872146608621466086,"ENST00000454000.6,ENST00000318249.10,ENST00000...","NULL,NULL,NULL,NULL,NULL",internal_exon_spliced,ENSG00000142794.19,NBPF3,1.0,ENSG00000142794.19_1
1,chr1,.,exon,21453372,21454306,.,+,.,ENSG00000142794.19,NBPF3,...,last,2144521921445219214452192144521921445219,2147063121468687214686872146608621466086,"ENST00000454000.6,ENST00000318249.10,ENST00000...","NULL,NULL,NULL,NULL,NULL",internal_exon_spliced,ENSG00000142794.19,NBPF3,2.0,ENSG00000142794.19_2
2,chr1,.,exon,21454855,21456901,.,+,.,PAPA.CTRL-2.310,,...,,,,,,"internal_exon_spliced,last_exon_spliced",ENSG00000142794.19,NBPF3,2.0,ENSG00000142794.19_2
3,chr1,.,exon,21483142,21485005,.,+,.,ENSG00000142794.19,NBPF3,...,last,2148253521482535214825352148253521482535,2148314221483142214831422148314221483142,"ENST00000318220.10,ENST00000318249.10,ENST0000...","NULL,NULL,NULL,NULL,NULL",last_exon_spliced,ENSG00000142794.19,NBPF3,4.0,ENSG00000142794.19_4
4,chr1,.,exon,21483142,21484900,.,+,.,ENSG00000142794.19,NBPF3,...,last,2148253521482535214825352148253521482535,2148314221483142214831422148314221483142,"ENST00000318220.10,ENST00000318249.10,ENST0000...","NULL,NULL,NULL,NULL,NULL",last_exon_spliced,ENSG00000142794.19,NBPF3,4.0,ENSG00000142794.19_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,chrX,.,exon,131689528,131692636,.,-,.,PAPA.CTRL-6.24702,,...,,,,,,internal_exon_spliced,ENSG00000213468.7,FIRRE,4.0,ENSG00000213468.7_4
792,chrX,.,exon,131689804,131692636,.,-,.,PAPA.ctrl_ctrl_2.27739,,...,,,,,,internal_exon_spliced,ENSG00000213468.7,FIRRE,4.0,ENSG00000213468.7_4
793,chrX,.,exon,108155606,108157260,.,-,.,ENSG00000197565.17,COL4A6,...,last,108157260108157260108157260,108159461108159461108159461,"ENST00000372216.8,ENST00000538570.5,ENST000006...","NULL,NULL,NULL",last_exon_spliced,ENSG00000197565.17,COL4A6,2.0,ENSG00000197565.17_2
794,chrX,.,exon,108155607,108157260,.,-,.,ENSG00000197565.17,COL4A6,...,last,108157260108157260108157260,108159461108159461108159461,"ENST00000372216.8,ENST00000538570.5,ENST000006...","NULL,NULL,NULL",last_exon_spliced,ENSG00000197565.17,COL4A6,2.0,ENSG00000197565.17_2


In [140]:
# extract all intervals associated with most distal ALE

# le_number is stored as float-like values ('4.0'); normalise to int before ranking
papa_gtf_cryp_genes_noncryp.le_number = (
    papa_gtf_cryp_genes_noncryp.le_number.astype(float).astype(int)
)

# terminal (most distal) last exon per gene, ranked by le_number within ref_gene_id
papa_gtf_terminal_les = get_terminal_regions(
    papa_gtf_cryp_genes_noncryp,
    id_col="ref_gene_id",
    region_number_col="le_number",
)
le_ids_most_distal = set(papa_gtf_terminal_les.le_id)

# keep every interval assigned to one of those le_ids
papa_gtf_ale_ipa_alt = papa_gtf_cryp_genes_noncryp.subset(
    lambda df: df.le_id.isin(le_ids_most_distal)
)
papa_gtf_ale_ipa_alt

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,21483142,21485005,.,+,.,ENSG00000142794.19,NBPF3,...,last,2148253521482535214825352148253521482535,2148314221483142214831422148314221483142,"ENST00000318220.10,ENST00000318249.10,ENST0000...","NULL,NULL,NULL,NULL,NULL",last_exon_spliced,ENSG00000142794.19,NBPF3,4,ENSG00000142794.19_4
1,chr1,.,exon,21483142,21484900,.,+,.,ENSG00000142794.19,NBPF3,...,last,2148253521482535214825352148253521482535,2148314221483142214831422148314221483142,"ENST00000318220.10,ENST00000318249.10,ENST0000...","NULL,NULL,NULL,NULL,NULL",last_exon_spliced,ENSG00000142794.19,NBPF3,4,ENSG00000142794.19_4
2,chr1,.,exon,21483142,21484900,.,+,.,ENSG00000142794.19,NBPF3,...,last,2148253521482535214825352148253521482535,2148314221483142214831422148314221483142,"ENST00000318220.10,ENST00000318249.10,ENST0000...","NULL,NULL,NULL,NULL,NULL",last_exon_spliced,ENSG00000142794.19,NBPF3,4,ENSG00000142794.19_4
3,chr1,.,exon,21483142,21483467,.,+,.,ENSG00000142794.19,NBPF3,...,last,2148253521482535214825352148253521482535,2148314221483142214831422148314221483142,"ENST00000318220.10,ENST00000318249.10,ENST0000...","NULL,NULL,NULL,NULL,NULL",last_exon_spliced,ENSG00000142794.19,NBPF3,4,ENSG00000142794.19_4
4,chr1,.,exon,21483142,21483412,.,+,.,ENSG00000142794.19,NBPF3,...,last,2148253521482535214825352148253521482535,2148314221483142214831422148314221483142,"ENST00000318220.10,ENST00000318249.10,ENST0000...","NULL,NULL,NULL,NULL,NULL",last_exon_spliced,ENSG00000142794.19,NBPF3,4,ENSG00000142794.19_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,chrX,.,exon,131689528,131692636,.,-,.,PAPA.CTRL-6.24702,,...,,,,,,internal_exon_spliced,ENSG00000213468.7,FIRRE,4,ENSG00000213468.7_4
345,chrX,.,exon,131689804,131692636,.,-,.,PAPA.ctrl_ctrl_2.27739,,...,,,,,,internal_exon_spliced,ENSG00000213468.7,FIRRE,4,ENSG00000213468.7_4
346,chrX,.,exon,108155606,108157260,.,-,.,ENSG00000197565.17,COL4A6,...,last,108157260108157260108157260,108159461108159461108159461,"ENST00000372216.8,ENST00000538570.5,ENST000006...","NULL,NULL,NULL",last_exon_spliced,ENSG00000197565.17,COL4A6,2,ENSG00000197565.17_2
347,chrX,.,exon,108155607,108157260,.,-,.,ENSG00000197565.17,COL4A6,...,last,108157260108157260108157260,108159461108159461108159461,"ENST00000372216.8,ENST00000538570.5,ENST000006...","NULL,NULL,NULL",last_exon_spliced,ENSG00000197565.17,COL4A6,2,ENSG00000197565.17_2


In [141]:
# merge le_ids to single interval: overlapping intervals sharing an le_id (on the same strand)
# are collapsed; only the coordinate columns and le_id survive the merge
papa_gtf_ale_ipa_alt_m = papa_gtf_ale_ipa_alt.merge(by="le_id", strand=True)
papa_gtf_ale_ipa_alt_m

Unnamed: 0,Chromosome,Start,End,Strand,le_id
0,chr1,77062974,77067546,+,ENSG00000117069.15_3
1,chr1,62178422,62178675,+,ENSG00000132849.22_8
2,chr1,21483142,21485005,+,ENSG00000142794.19_4
3,chr1,54608956,54610329,+,ENSG00000162390.18_3
4,chr1,245702457,245709432,+,ENSG00000162849.16_3
...,...,...,...,...,...
101,chrX,98864610,98867720,+,ENSG00000281566.3_3
102,chrX,17799730,17802034,-,ENSG00000131831.18_3
103,chrX,40648304,40651879,-,ENSG00000180182.11_4
104,chrX,108155606,108157260,-,ENSG00000197565.17_2


In [142]:
# minimal annotation per le_id, renamed to the attribute names used in the output GTF
meta_cols = ["Feature", "le_id", "ref_gene_id", "ref_gene_name", "le_number"]
meta_ale_ipa_alt = (
    papa_gtf_ale_ipa_alt.as_df()[meta_cols]
    .rename(
        columns={
            "ref_gene_id": "gene_id",
            "ref_gene_name": "gene_name",
            "le_number": "pas_number",
        }
    )
    .drop_duplicates()
)

# attach that annotation to the merged intervals (left join on le_id), then re-sort
papa_gtf_ale_ipa_alt_m = papa_gtf_ale_ipa_alt_m.apply(
    lambda df: df.merge(meta_ale_ipa_alt, on="le_id", how="left")
).sort()

# add transcript id (assign to le-id)
papa_gtf_ale_ipa_alt_m.transcript_id = papa_gtf_ale_ipa_alt_m.le_id
papa_gtf_ale_ipa_alt_m

Unnamed: 0,Chromosome,Start,End,Strand,le_id,Feature,gene_id,gene_name,pas_number,transcript_id
0,chr1,21483142,21485005,+,ENSG00000142794.19_4,exon,ENSG00000142794.19,NBPF3,4,ENSG00000142794.19_4
1,chr1,54608956,54610329,+,ENSG00000162390.18_3,exon,ENSG00000162390.18,ACOT11,3,ENSG00000162390.18_3
2,chr1,62178422,62178675,+,ENSG00000132849.22_8,exon,ENSG00000132849.22,PATJ,8,ENSG00000132849.22_8
3,chr1,77062974,77067546,+,ENSG00000117069.15_3,exon,ENSG00000117069.15,ST6GALNAC5,3,ENSG00000117069.15_3
4,chr1,245702457,245709432,+,ENSG00000162849.16_3,exon,ENSG00000162849.16,KIF26B,3,ENSG00000162849.16_3
...,...,...,...,...,...,...,...,...,...,...
101,chrX,102748698,102753658,+,ENSG00000198908.12_3,exon,ENSG00000198908.12,BHLHB9,3,ENSG00000198908.12_3
102,chrX,17799730,17802034,-,ENSG00000131831.18_3,exon,ENSG00000131831.18,RAI2,3,ENSG00000131831.18_3
103,chrX,40648304,40651879,-,ENSG00000180182.11_4,exon,ENSG00000180182.11,MED14,4,ENSG00000180182.11_4
104,chrX,108155606,108157260,-,ENSG00000197565.17_2,exon,ENSG00000197565.17,COL4A6,2,ENSG00000197565.17_2


In [143]:
# get feature counts ready i.e. make copies with 'transcript' and 'gene' in the Feature col

# Start from the 'exon' rows only. After the previous cell every row is expected to be an
# 'exon' row, so a first run is unchanged; on a re-run this stops the cell from stacking
# further 'transcript'/'gene' copies onto an object it has already expanded.
papa_gtf_ale_ipa_alt_exons = papa_gtf_ale_ipa_alt_m.subset(lambda df: df.Feature == "exon")

# one full copy of the intervals per additional Feature value
grs = [papa_gtf_ale_ipa_alt_exons.assign("Feature", lambda df: pd.Series([feat]*len(df), index=df.index)) for feat in ["transcript", "gene"]]
# combine into single gr (exon + transcript + gene rows for every interval)
papa_gtf_ale_ipa_alt_m = pr.concat([papa_gtf_ale_ipa_alt_exons, *grs]).sort()
papa_gtf_ale_ipa_alt_m

Unnamed: 0,Chromosome,Start,End,Strand,le_id,Feature,gene_id,gene_name,pas_number,transcript_id
0,chr1,21483142,21485005,+,ENSG00000142794.19_4,exon,ENSG00000142794.19,NBPF3,4,ENSG00000142794.19_4
1,chr1,21483142,21485005,+,ENSG00000142794.19_4,transcript,ENSG00000142794.19,NBPF3,4,ENSG00000142794.19_4
2,chr1,21483142,21485005,+,ENSG00000142794.19_4,gene,ENSG00000142794.19,NBPF3,4,ENSG00000142794.19_4
3,chr1,54608956,54610329,+,ENSG00000162390.18_3,exon,ENSG00000162390.18,ACOT11,3,ENSG00000162390.18_3
4,chr1,54608956,54610329,+,ENSG00000162390.18_3,transcript,ENSG00000162390.18,ACOT11,3,ENSG00000162390.18_3
...,...,...,...,...,...,...,...,...,...,...
313,chrX,108155606,108157260,-,ENSG00000197565.17_2,transcript,ENSG00000197565.17,COL4A6,2,ENSG00000197565.17_2
314,chrX,108155606,108157260,-,ENSG00000197565.17_2,gene,ENSG00000197565.17,COL4A6,2,ENSG00000197565.17_2
315,chrX,131688563,131692636,-,ENSG00000213468.7_4,exon,ENSG00000213468.7,FIRRE,4,ENSG00000213468.7_4
316,chrX,131688563,131692636,-,ENSG00000213468.7_4,transcript,ENSG00000213468.7,FIRRE,4,ENSG00000213468.7_4


## Clean up TLX1 and RFNG

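TLX1 and RFNG are Zeng et al. targets that need manual curation: RFNG is absent from the PAPA cryptic BED, and TLX1 is present but misannotated. Their proximal and distal last exons are selected by hand below.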

In [8]:
# Split the GTF on the transcript_id prefix:
#   - novel predictions: transcript_id starts with "PAPA"
#   - annotated: everything else
# Both subsets are kept as separate objects; papa_gtf itself is left untouched.
papa_gtf_novel = papa_gtf.subset(
    lambda df: df.transcript_id.str.startswith("PAPA")
)
papa_gtf_annot = papa_gtf.subset(
    lambda df: ~df.transcript_id.str.startswith("PAPA")
)

In [9]:
# subset to Zeng et al. target genes (+ TLX1)
# i.e. novel (PAPA-predicted) last exons whose ref_gene_name is in zeng_targets_papa_missing
papa_gtf_novel_sel = papa_gtf_novel.subset(lambda df: df.ref_gene_name.isin(zeng_targets_papa_missing))
papa_gtf_novel_sel


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr10,.,exon,101134174,101138530,.,+,.,PAPA.TDP-1.13599,,...,,,,,,first_exon_spliced,ENSG00000107807.13,TLX1,1.0,ENSG00000107807.13_1
1,chr10,.,exon,101134174,101138530,.,+,.,PAPA.TDP43-F_S6.10958,,...,,,,,,first_exon_spliced,ENSG00000107807.13,TLX1,1.0,ENSG00000107807.13_1
2,chr10,.,exon,101136690,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2
3,chr10,.,exon,101136690,101138522,.,+,.,PAPA.TDP43_19065409_S29.14398,,...,,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2
4,chr10,.,exon,101136690,101138528,.,+,.,PAPA.TDP43_ctrl_2.14536,,...,,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2
5,chr17,.,exon,82045104,82048807,.,-,.,PAPA.TDP43_19065409_S29.22294,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
6,chr17,.,exon,82045147,82048807,.,-,.,PAPA.NT_19074723_S36.21380,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
7,chr17,.,exon,82045150,82048807,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
8,chr17,.,exon,82045423,82048807,.,-,.,PAPA.TDP-4.21887,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


### TLX1

In [10]:
# inspect the TLX1 entries currently in the cryptic BED
# (substring match on Name, so any gene symbol containing 'TLX1' would be listed too)
papa_cryp_bed.subset(lambda df: df.Name.str.contains("TLX1"))

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr10,101134376,101138530,ENSG00000107807.13_1|TLX1|proximal|cryptic,.,+
1,chr10,101136690,101138531,ENSG00000107807.13_2|TLX1|distal|cryptic,.,+


In [11]:
# Hand-curated TLX1 intervals. A bare coordinate is not unique genome-wide (the same Start/End
# value can occur on another chromosome), so each selection is also restricted to TLX1.

# proximal: annotated last exon starting at 101136690
papa_gtf_tlx1_prox = papa_gtf_annot.subset(lambda df: (df.ref_gene_name == "TLX1") & (df.Start == 101136690))
# distal PAS can be plucked straight as selected in BED file (novel event ending at 101138531)
papa_gtf_tlx1_dist = papa_gtf_novel.subset(lambda df: (df.ref_gene_name == "TLX1") & (df.End == 101138531))

# combine proximal + distal into one object
# NOTE(review): an earlier comment here mentioned building the proximal interval by overlapping
# with the Zeng PAS and truncating; this cell only concatenates the two selections
tlx1 = pr.concat([papa_gtf_tlx1_prox, papa_gtf_tlx1_dist])
tlx1

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr10,.,exon,101136690,101137789,.,+,.,ENSG00000107807.13,TLX1,...,last,1.0113437610113437e+17,1.0113669010113666e+17,"ENST00000370196.11,ENST00000467928.2","NULL,NULL",last_exon_spliced,ENSG00000107807.13,TLX1,1.0,ENSG00000107807.13_1
1,chr10,.,exon,101136690,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2


### RFNG

The problem here was a slight bug in PAPA, which grouped the extension events together with the annotated event (they share the same le_id).

In [12]:
# all RFNG last exons, split into annotated and novel (PAPA-predicted) entries
rfng_annot = papa_gtf_annot.subset(lambda df: df.ref_gene_name == "RFNG")
rfng_novel = papa_gtf_novel.subset(lambda df: df.ref_gene_name == "RFNG")
rfng_annot

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
1,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
2,chr17,.,exon,82047901,82049116,.,-,.,ENSG00000169733.12,RFNG,...,last,82049030,82049116,ENST00000310496.9,1129.0,internal_exon_extension,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
3,chr17,.,exon,82048444,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


In [13]:
# pick longest annotated RFNG PAS
# Selects the annotated exon 82047901-82048807; rows duplicated at the same position are
# collapsed to one by drop_duplicate_positions().
# NOTE(review): the 82047901-82049116 exon (internal_exon_extension) in the previous output is
# longer but excluded by the End filter - presumably intentional, confirm
rfng_prox = rfng_annot.subset(lambda df: (df.Start == 82047901 ) & (df.End == 82048807)).drop_duplicate_positions()
rfng_prox

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


In [14]:
# manual curation: of the novel RFNG last exons, pick the one starting at 82045150
# (RFNG is on the '-' strand, so Start is the 3' end of the interval)
rfng_dist = rfng_novel.subset(lambda df: df.Start == 82045150)
rfng_dist


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
52583,chr17,.,exon,82045150,82048807,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


In [15]:
# combine the curated annotated (rfng_prox) and novel (rfng_dist) RFNG intervals
rfng = pr.concat([rfng_prox, rfng_dist]).sort()
rfng

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr17,.,exon,82045150,82048807,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
1,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


In [16]:
# single object holding the manually curated RFNG and TLX1 intervals
rfng_tlx1 = pr.concat([rfng, tlx1]).sort()
rfng_tlx1

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr10,.,exon,101136690,101137789,.,+,.,ENSG00000107807.13,TLX1,...,last,101134376101134376,101136690101136667,"ENST00000370196.11,ENST00000467928.2","NULL,NULL",last_exon_spliced,ENSG00000107807.13,TLX1,1.0,ENSG00000107807.13_1
1,chr10,.,exon,101136690,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2
2,chr17,.,exon,82045150,82048807,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1
3,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,last,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1


In [28]:
# assign proximal + distal pas, generate bins
# add_region_number writes a per-gene (ref_gene_name) rank into 'pas_number'; below,
# 1 is treated as proximal and 2 as distal.
# NOTE(review): sort_col is 'End' for both strands. For '-' strand features the 3' end is
# Start, and the two RFNG rows share the same End - confirm add_region_number orders
# minus-strand regions as intended rather than relying on tie-breaking (method="first")
rfng_tlx1 = add_region_number(rfng_tlx1,
                  id_col="ref_gene_name",
                  feature_key="exon",
                  out_col="pas_number",
                  sort_col="End",
                  method="first"
                  ).sort()

# split into proximal (pas_number 1) and distal (pas_number 2) objects
rfng_tlx1_prox = rfng_tlx1.subset(lambda df: df.pas_number == 1)
rfng_tlx1_dist = rfng_tlx1.subset(lambda df: df.pas_number == 2)

rfng_tlx1


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id,pas_number
0,chr10,.,exon,101136690,101137789,.,+,.,ENSG00000107807.13,TLX1,...,101134376101134376,101136690101136667,"ENST00000370196.11,ENST00000467928.2","NULL,NULL",last_exon_spliced,ENSG00000107807.13,TLX1,1.0,ENSG00000107807.13_1,1.0
1,chr10,.,exon,101136690,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2,2.0
2,chr17,.,exon,82045150,82048807,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1,2.0
3,chr17,.,exon,82047901,82048807,.,-,.,ENSG00000169733.12,RFNG,...,820488078204880782048807,820490308204903082049030,"ENST00000310496.9,ENST00000578676.5,ENST000005...","NULL,NULL,NULL",last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1,1.0


In [29]:
# Subtract prox from distal to get the unique segment of the distal cryptic
# (same-strand subtraction: only the part of each distal interval not covered by a proximal interval remains)
rfng_tlx1_dist_uniq = rfng_tlx1_dist.subtract(rfng_tlx1_prox, strandedness="same")
rfng_tlx1_dist_uniq

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id,pas_number
0,chr10,.,exon,101137789,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2,2.0
1,chr17,.,exon,82045150,82047901,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1,2.0


## Generating bins for all ALE events

In [17]:
# pull out gene name (2nd pipe-delimited field of Name, e.g. ...|SPEN|proximal|cryptic -> SPEN)
papa_cryp_bed_upd = papa_cryp_bed.assign("gene_name", lambda df: df.Name.str.split("\\|", expand=True)[1])
# remove tlx1 from analysis (it is curated by hand separately)
# Exact match on the parsed gene name: a substring test on Name would also drop any other
# gene whose symbol merely contains 'TLX1'.
papa_cryp_bed_upd = papa_cryp_bed_upd.subset(lambda df: df.gene_name != "TLX1")
papa_cryp_bed_upd

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name
0,chr1,15939295,15940456,ENSG00000065526.12_3|SPEN|proximal|cryptic,.,+,SPEN
1,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN
2,chr1,177280411,177282422,ENSG00000198797.7_1|BRINP2|proximal|cryptic,.,+,BRINP2
3,chr1,177280411,177284690,ENSG00000198797.7_2|BRINP2|distal|cryptic,.,+,BRINP2
4,chr1,76871267,76871821,ENSG00000117069.15_2|ST6GALNAC5|spliced|cryptic,.,+,ST6GALNAC5
...,...,...,...,...,...,...,...
277,chrX,123600573,123601338,ENSG00000125676.20_3|THOC2|distal|cryptic,.,-,THOC2
278,chrX,17835910,17837395,ENSG00000131831.18_1|RAI2|spliced|cryptic,.,-,RAI2
279,chrX,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,COL4A6
280,chrX,40653641,40654363,ENSG00000180182.11_3|MED14|bleedthrough|cryptic,.,-,MED14


In [30]:
# annotate prox and distal 
# A constant 'exon' Feature column is added first because add_region_number is called with
# feature_key="exon"; the per-gene rank is written to 'pas_number'.
# NOTE(review): sort_col is 'End' for both strands - confirm add_region_number handles
# '-' strand genes (3' end = Start) as intended
papa_cryp_bed_upd = add_region_number(papa_cryp_bed_upd.assign("Feature", lambda df: pd.Series(["exon"]*len(df), index=df.index)),
                  id_col="gene_name",
                  feature_key="exon",
                  out_col="pas_number",
                  sort_col="End",
                  method="first"
                  ).sort()

# extract to individual objects (pas_number 1 = proximal, pas_number 2 = distal).
# The 3'UTR-only pre-filter below is disabled, so despite the '_utr_' in their names the two
# objects are taken from ALL events (ALE/IPA rows included).
# papa_cryp_bed_upd_utr = papa_cryp_bed_upd.subset(lambda df: df.Name.str.contains("proximal|distal"))


papa_cryp_bed_upd_utr_prox = papa_cryp_bed_upd.subset(lambda df: df.pas_number == 1)
papa_cryp_bed_upd_utr_dist = papa_cryp_bed_upd.subset(lambda df: df.pas_number == 2)

papa_cryp_bed_upd


Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name,Feature,pas_number
0,chr1,15939295,15940456,ENSG00000065526.12_3|SPEN|proximal|cryptic,.,+,SPEN,exon,1.0
1,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,exon,2.0
2,chr1,21453372,21457150,ENSG00000142794.19_3|NBPF3|bleedthrough|cryptic,.,+,NBPF3,exon,1.0
3,chr1,54634687,54639192,ENSG00000162390.18_4|ACOT11|spliced|cryptic,.,+,ACOT11,exon,1.0
4,chr1,61824444,61825501,ENSG00000132849.22_1|PATJ|spliced|cryptic,.,+,PATJ,exon,1.0
...,...,...,...,...,...,...,...,...,...
277,chrX,47635520,47636927,ENSG00000126767.18_1|ELK1|proximal|cryptic,.,-,ELK1,exon,1.0
278,chrX,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,COL4A6,exon,1.0
279,chrX,123600550,123603748,ENSG00000125676.20_2|THOC2|proximal|cryptic,.,-,THOC2,exon,2.0
280,chrX,123600573,123601338,ENSG00000125676.20_3|THOC2|distal|cryptic,.,-,THOC2,exon,1.0


In [31]:
# define distal specific region (ONLY for UTRs)
# papa_cryp_prox / papa_cryp_dist are not defined anywhere above this cell, so on a fresh
# kernel (Restart & Run All) the subtraction raised a NameError. Bind them to the
# proximal / distal objects created in the previous cell.
# NOTE(review): assumes the short names were meant to be exactly these two objects - confirm
papa_cryp_prox = papa_cryp_bed_upd_utr_prox
papa_cryp_dist = papa_cryp_bed_upd_utr_dist

# same-strand subtraction: keep only the part of each distal interval not covered by a proximal one
papa_cryp_dist_uniq = papa_cryp_dist.subtract(papa_cryp_prox, strandedness="same")
papa_cryp_dist_uniq

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name,Feature,pas_number
0,chr1,15940456,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,exon,2.0
1,chr1,177282422,177284690,ENSG00000198797.7_2|BRINP2|distal|cryptic,.,+,BRINP2,exon,2.0
2,chr1,71063238,71063290,ENSG00000132485.14_3|ZRANB2|distal|cryptic,.,-,ZRANB2,exon,2.0
3,chr1,111117005,111117162,ENSG00000156171.15_4|DRAM2|distal|cryptic,.,-,DRAM2,exon,2.0
4,chr1,120150756,120150897,ENSG00000265808.4_3|SEC22B|distal|cryptic,.,-,SEC22B,exon,2.0
...,...,...,...,...,...,...,...,...,...
89,chr22,50621309,50622753,ENSG00000100299.18_2|ARSA|distal|cryptic,.,-,ARSA,exon,2.0
90,chr22,50623027,50625464,ENSG00000100299.18_2|ARSA|distal|cryptic,.,-,ARSA,exon,2.0
91,chrX,47631618,47635520,ENSG00000126767.18_2|ELK1|distal|cryptic,.,-,ELK1,exon,2.0
92,chrX,123600550,123600573,ENSG00000125676.20_2|THOC2|proximal|cryptic,.,-,THOC2,exon,2.0


## FeatureCounts compliant

featureCounts compliant GTF checklist:
- each interval reported with 'exon' assigned to Feature column
- Unique isoforms/intervals also represented as 'gene' & 'transcript' in Feature column
- minimal attributes - gene_id, transcript_id


In [161]:
# ALEs/IPAs dropped from papa_cryp_dist_uniq (because assigned as proximal)
papa_cryp_ale_ipa = papa_cryp_bed_upd.subset(lambda df: ~df.Name.str.contains("proximal|distal"))

# assign gene_id, transcript_id
# ENSG00000142794.19_3|NBPF3|bleedthrough|cryptic
# le_id is the first pipe-delimited field of Name; parse it once and reuse it
ale_ipa_le_ids = papa_cryp_ale_ipa.Name.str.split("\\|", expand=True)[0]
# le_id = <gene_id>_<le_number>: strip only the trailing '_<le_number>' (split from the right,
# once) so that a gene_id which itself contains '_' is not truncated
papa_cryp_ale_ipa.gene_id = ale_ipa_le_ids.str.rsplit("_", n=1, expand=True)[0]
papa_cryp_ale_ipa.transcript_id = ale_ipa_le_ids
papa_cryp_ale_ipa.le_id = ale_ipa_le_ids
# keep the original BED name under a more explicit column name
papa_cryp_ale_ipa = papa_cryp_ale_ipa.apply(lambda df: df.rename(columns={"Name": "papa_bed_name"}))

# featureCounts needs every interval as 'exon' plus matching 'transcript' and 'gene' rows:
# make one copy of the intervals per additional Feature value
grs = [papa_cryp_ale_ipa.assign("Feature", lambda df: pd.Series([feat]*len(df), index=df.index)) for feat in ["transcript", "gene"]]
# combine into single gr
papa_cryp_ale_ipa = pr.concat([papa_cryp_ale_ipa, *grs]).sort()
papa_cryp_ale_ipa



Unnamed: 0,Chromosome,Start,End,papa_bed_name,Score,Strand,gene_name,Feature,pas_number,gene_id,transcript_id,le_id
0,chr1,21453372,21457150,ENSG00000142794.19_3|NBPF3|bleedthrough|cryptic,.,+,NBPF3,exon,1.0,ENSG00000142794.19,ENSG00000142794.19_3,ENSG00000142794.19_3
1,chr1,21453372,21457150,ENSG00000142794.19_3|NBPF3|bleedthrough|cryptic,.,+,NBPF3,transcript,1.0,ENSG00000142794.19,ENSG00000142794.19_3,ENSG00000142794.19_3
2,chr1,21453372,21457150,ENSG00000142794.19_3|NBPF3|bleedthrough|cryptic,.,+,NBPF3,gene,1.0,ENSG00000142794.19,ENSG00000142794.19_3,ENSG00000142794.19_3
3,chr1,54634687,54639192,ENSG00000162390.18_4|ACOT11|spliced|cryptic,.,+,ACOT11,exon,1.0,ENSG00000162390.18,ENSG00000162390.18_4,ENSG00000162390.18_4
4,chr1,54634687,54639192,ENSG00000162390.18_4|ACOT11|spliced|cryptic,.,+,ACOT11,transcript,1.0,ENSG00000162390.18,ENSG00000162390.18_4,ENSG00000162390.18_4
...,...,...,...,...,...,...,...,...,...,...,...,...
331,chrX,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,COL4A6,transcript,1.0,ENSG00000197565.17,ENSG00000197565.17_1,ENSG00000197565.17_1
332,chrX,108267641,108269152,ENSG00000197565.17_1|COL4A6|spliced|cryptic,.,-,COL4A6,gene,1.0,ENSG00000197565.17,ENSG00000197565.17_1,ENSG00000197565.17_1
333,chrX,131823775,131825221,ENSG00000213468.7_1|FIRRE|bleedthrough|cryptic,.,-,FIRRE,exon,1.0,ENSG00000213468.7,ENSG00000213468.7_1,ENSG00000213468.7_1
334,chrX,131823775,131825221,ENSG00000213468.7_1|FIRRE|bleedthrough|cryptic,.,-,FIRRE,transcript,1.0,ENSG00000213468.7,ENSG00000213468.7_1,ENSG00000213468.7_1


In [34]:
# Derive gene_id from the Name field for both the distal-specific bins
# (papa_cryp_dist_uniq) and the full-length regions (papa_cryp_dist):
#   ENSG00000126767.18_2|ELK1|distal|cryptic -> ENSG00000126767.18
# i.e. take the first "|"-delimited field, then the text before the first "_".
for _dist_gr in (papa_cryp_dist_uniq, papa_cryp_dist):
    _dist_le_ids = _dist_gr.Name.str.split("\\|", expand=True)[0]
    _dist_gr.gene_id = _dist_le_ids.str.split("_", expand=True, regex=False)[0]

papa_cryp_dist

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name,Feature,pas_number,gene_id
0,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,exon,2.0,ENSG00000065526.12
1,chr1,177280411,177284690,ENSG00000198797.7_2|BRINP2|distal|cryptic,.,+,BRINP2,exon,2.0,ENSG00000198797.7
2,chr1,71063238,71065137,ENSG00000132485.14_3|ZRANB2|distal|cryptic,.,-,ZRANB2,exon,2.0,ENSG00000132485.14
3,chr1,111117005,111118267,ENSG00000156171.15_4|DRAM2|distal|cryptic,.,-,DRAM2,exon,2.0,ENSG00000156171.15
4,chr1,120150756,120157192,ENSG00000265808.4_3|SEC22B|distal|cryptic,.,-,SEC22B,exon,2.0,ENSG00000265808.4
...,...,...,...,...,...,...,...,...,...,...
84,chr20,33675132,33676979,ENSG00000101412.13_2|E2F1|distal|cryptic,.,-,E2F1,exon,2.0,ENSG00000101412.13
85,chr22,42681021,42693997,ENSG00000128274.17_2|A4GALT|distal|cryptic,.,-,A4GALT,exon,2.0,ENSG00000128274.17
86,chr22,50621309,50625464,ENSG00000100299.18_2|ARSA|distal|cryptic,.,-,ARSA,exon,2.0,ENSG00000100299.18
87,chrX,47631618,47636927,ENSG00000126767.18_2|ELK1|distal|cryptic,.,-,ELK1,exon,2.0,ENSG00000126767.18


In [63]:
# transcript_id for PAPA-defined events is the le_id (first "|" field of Name).
# The distal-unique regions get a "__distal" suffix so that they form a
# separate transcript from the full-length region sharing the same le_id.
dist_uniq_le_ids = papa_cryp_dist_uniq.Name.str.split("\\|", expand=True)[0]
papa_cryp_dist_uniq.transcript_id = dist_uniq_le_ids + "__distal"

dist_full_le_ids = papa_cryp_dist.Name.str.split("\\|", expand=True)[0]
papa_cryp_dist.transcript_id = dist_full_le_ids

papa_cryp_dist

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name,Feature,pas_number,gene_id,transcript_id
0,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,exon,2.0,ENSG00000065526.12,ENSG00000065526.12_4
1,chr1,177280411,177284690,ENSG00000198797.7_2|BRINP2|distal|cryptic,.,+,BRINP2,exon,2.0,ENSG00000198797.7,ENSG00000198797.7_2
2,chr1,71063238,71065137,ENSG00000132485.14_3|ZRANB2|distal|cryptic,.,-,ZRANB2,exon,2.0,ENSG00000132485.14,ENSG00000132485.14_3
3,chr1,111117005,111118267,ENSG00000156171.15_4|DRAM2|distal|cryptic,.,-,DRAM2,exon,2.0,ENSG00000156171.15,ENSG00000156171.15_4
4,chr1,120150756,120157192,ENSG00000265808.4_3|SEC22B|distal|cryptic,.,-,SEC22B,exon,2.0,ENSG00000265808.4,ENSG00000265808.4_3
...,...,...,...,...,...,...,...,...,...,...,...
84,chr20,33675132,33676979,ENSG00000101412.13_2|E2F1|distal|cryptic,.,-,E2F1,exon,2.0,ENSG00000101412.13,ENSG00000101412.13_2
85,chr22,42681021,42693997,ENSG00000128274.17_2|A4GALT|distal|cryptic,.,-,A4GALT,exon,2.0,ENSG00000128274.17,ENSG00000128274.17_2
86,chr22,50621309,50625464,ENSG00000100299.18_2|ARSA|distal|cryptic,.,-,ARSA,exon,2.0,ENSG00000100299.18,ENSG00000100299.18_2
87,chrX,47631618,47636927,ENSG00000126767.18_2|ELK1|distal|cryptic,.,-,ELK1,exon,2.0,ENSG00000126767.18,ENSG00000126767.18_2


In [64]:
# The existing intervals are kept unchanged; add one copy of every interval
# with Feature overwritten as "transcript" and another as "gene".
grs = [
    papa_cryp_dist.assign("Feature", lambda df: pd.Series(feat, index=df.index))
    for feat in ("transcript", "gene")
]
# Originals + copies in a single PyRanges
papa_cryp_dists = pr.concat([papa_cryp_dist] + grs)
# Each Feature value should appear the same number of times
papa_cryp_dists.Feature.value_counts()


Feature
exon          89
transcript    89
gene          89
Name: count, dtype: int64

In [65]:
# Same expansion for the distal-unique regions: keep the original intervals
# and add "transcript" and "gene" copies of each.
grs = [
    papa_cryp_dist_uniq.assign("Feature", lambda df: pd.Series(feat, index=df.index))
    for feat in ("transcript", "gene")
]
# Originals + copies in a single PyRanges
papa_cryp_dist_uniqs = pr.concat([papa_cryp_dist_uniq] + grs)
# Each Feature value should appear the same number of times
papa_cryp_dist_uniqs.Feature.value_counts()

Feature
exon          94
transcript    94
gene          94
Name: count, dtype: int64

In [69]:
# Stack the distal-unique and full-length region tables, sort them, and keep
# the original BED name under "papa_bed_name".
dist_region_parts = [papa_cryp_dist_uniqs, papa_cryp_dists]
comb_gtf = (
    pr.concat(dist_region_parts)
    .sort()
    .apply(lambda df: df.rename(columns={"Name": "papa_bed_name"}))
)
# le_id is the transcript_id without the "__distal" marker, so the full-length
# and distal-unique rows of one event share the same le_id.
comb_tx_ids = comb_gtf.transcript_id
comb_gtf.le_id = comb_tx_ids.str.removesuffix("__distal")
comb_gtf

Unnamed: 0,Chromosome,Start,End,papa_bed_name,Score,Strand,gene_name,Feature,pas_number,gene_id,transcript_id,le_id
0,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,exon,2.0,ENSG00000065526.12,ENSG00000065526.12_4,ENSG00000065526.12_4
1,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,transcript,2.0,ENSG00000065526.12,ENSG00000065526.12_4,ENSG00000065526.12_4
2,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,gene,2.0,ENSG00000065526.12,ENSG00000065526.12_4,ENSG00000065526.12_4
3,chr1,15940456,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,exon,2.0,ENSG00000065526.12,ENSG00000065526.12_4__distal,ENSG00000065526.12_4
4,chr1,15940456,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,transcript,2.0,ENSG00000065526.12,ENSG00000065526.12_4__distal,ENSG00000065526.12_4
...,...,...,...,...,...,...,...,...,...,...,...,...
544,chrX,123600550,123603748,ENSG00000125676.20_2|THOC2|proximal|cryptic,.,-,THOC2,transcript,2.0,ENSG00000125676.20,ENSG00000125676.20_2,ENSG00000125676.20_2
545,chrX,123600550,123603748,ENSG00000125676.20_2|THOC2|proximal|cryptic,.,-,THOC2,gene,2.0,ENSG00000125676.20,ENSG00000125676.20_2,ENSG00000125676.20_2
546,chrX,123601338,123603748,ENSG00000125676.20_2|THOC2|proximal|cryptic,.,-,THOC2,exon,2.0,ENSG00000125676.20,ENSG00000125676.20_2__distal,ENSG00000125676.20_2
547,chrX,123601338,123603748,ENSG00000125676.20_2|THOC2|proximal|cryptic,.,-,THOC2,transcript,2.0,ENSG00000125676.20,ENSG00000125676.20_2__distal,ENSG00000125676.20_2


In [70]:
# Spot check: show every combined-GTF row belonging to ELK1
comb_gtf.subset(lambda df: df["gene_name"] == "ELK1")

Unnamed: 0,Chromosome,Start,End,papa_bed_name,Score,Strand,gene_name,Feature,pas_number,gene_id,transcript_id,le_id
0,chrX,47631618,47635520,ENSG00000126767.18_2|ELK1|distal|cryptic,.,-,ELK1,exon,2.0,ENSG00000126767.18,ENSG00000126767.18_2__distal,ENSG00000126767.18_2
1,chrX,47631618,47635520,ENSG00000126767.18_2|ELK1|distal|cryptic,.,-,ELK1,transcript,2.0,ENSG00000126767.18,ENSG00000126767.18_2__distal,ENSG00000126767.18_2
2,chrX,47631618,47635520,ENSG00000126767.18_2|ELK1|distal|cryptic,.,-,ELK1,gene,2.0,ENSG00000126767.18,ENSG00000126767.18_2__distal,ENSG00000126767.18_2
3,chrX,47631618,47636927,ENSG00000126767.18_2|ELK1|distal|cryptic,.,-,ELK1,exon,2.0,ENSG00000126767.18,ENSG00000126767.18_2,ENSG00000126767.18_2
4,chrX,47631618,47636927,ENSG00000126767.18_2|ELK1|distal|cryptic,.,-,ELK1,transcript,2.0,ENSG00000126767.18,ENSG00000126767.18_2,ENSG00000126767.18_2
5,chrX,47631618,47636927,ENSG00000126767.18_2|ELK1|distal|cryptic,.,-,ELK1,gene,2.0,ENSG00000126767.18,ENSG00000126767.18_2,ENSG00000126767.18_2


In [78]:
# For each interval, add copies whose Feature column is set to "transcript"
# and "gene"; the original intervals are kept as they are.

# Full-length regions
grs = [
    rfng_tlx1_dist.assign("Feature", lambda df: pd.Series(feat, index=df.index))
    for feat in ("transcript", "gene")
]
rfng_tlx1_dists = pr.concat([rfng_tlx1_dist] + grs)

# Distal-unique regions; their le_id gets a "__distal" suffix
grs = [
    rfng_tlx1_dist_uniq.assign("Feature", lambda df: pd.Series(feat, index=df.index))
    for feat in ("transcript", "gene")
]
rfng_tlx1_dist_uniqs = pr.concat([rfng_tlx1_dist_uniq] + grs)
rfng_tlx1_distal_le_ids = rfng_tlx1_dist_uniqs.le_id + "__distal"
rfng_tlx1_dist_uniqs.le_id = rfng_tlx1_distal_le_ids

# Full-length + distal-unique rows in one sorted PyRanges
rfng_tlx1_comb_gtf = pr.concat([rfng_tlx1_dists, rfng_tlx1_dist_uniqs]).sort()
rfng_tlx1_comb_gtf

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id,pas_number
0,chr10,.,exon,101136690,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2,2.0
1,chr10,.,transcript,101136690,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2,2.0
2,chr10,.,gene,101136690,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2,2.0
3,chr10,.,exon,101137789,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2__distal,2.0
4,chr10,.,transcript,101137789,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2__distal,2.0
5,chr10,.,gene,101137789,101138531,.,+,.,PAPA.doxconc_DOX_0075_1.10465,,...,,,,,last_exon_extension,ENSG00000107807.13,TLX1,2.0,ENSG00000107807.13_2__distal,2.0
6,chr17,.,exon,82045150,82047901,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1__distal,2.0
7,chr17,.,transcript,82045150,82047901,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1__distal,2.0
8,chr17,.,gene,82045150,82047901,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1__distal,2.0
9,chr17,.,exon,82045150,82048807,.,-,.,PAPA.TDP43-F_S6.17048,,...,,,,,last_exon_spliced,ENSG00000169733.12,RFNG,1.0,ENSG00000169733.12_1,2.0


In [79]:
# Keep only the columns needed downstream, then expose the reference gene
# annotation under the plain gene_id / gene_name column names.
# NOTE(review): this cell overwrites its own input; re-running it on the
# renamed table will presumably fail because the ref_* columns are gone.
rfng_tlx1_keep_cols = ["Feature", "ref_gene_name", "ref_gene_id", "le_id", "pas_number"]
rfng_tlx1_renames = {"ref_gene_id": "gene_id", "ref_gene_name": "gene_name"}
rfng_tlx1_comb_gtf = rfng_tlx1_comb_gtf[rfng_tlx1_keep_cols].apply(
    lambda df: df.rename(columns=rfng_tlx1_renames)
)

# transcript_id keeps any "__distal" suffix; le_id has it stripped
rfng_tlx1_suffixed_ids = rfng_tlx1_comb_gtf.le_id
rfng_tlx1_comb_gtf.transcript_id = rfng_tlx1_suffixed_ids
rfng_tlx1_comb_gtf.le_id = rfng_tlx1_suffixed_ids.str.removesuffix("__distal")
rfng_tlx1_comb_gtf

Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_name,gene_id,le_id,pas_number,transcript_id
0,chr10,exon,101136690,101138531,+,TLX1,ENSG00000107807.13,ENSG00000107807.13_2,2.0,ENSG00000107807.13_2
1,chr10,transcript,101136690,101138531,+,TLX1,ENSG00000107807.13,ENSG00000107807.13_2,2.0,ENSG00000107807.13_2
2,chr10,gene,101136690,101138531,+,TLX1,ENSG00000107807.13,ENSG00000107807.13_2,2.0,ENSG00000107807.13_2
3,chr10,exon,101137789,101138531,+,TLX1,ENSG00000107807.13,ENSG00000107807.13_2,2.0,ENSG00000107807.13_2__distal
4,chr10,transcript,101137789,101138531,+,TLX1,ENSG00000107807.13,ENSG00000107807.13_2,2.0,ENSG00000107807.13_2__distal
5,chr10,gene,101137789,101138531,+,TLX1,ENSG00000107807.13,ENSG00000107807.13_2,2.0,ENSG00000107807.13_2__distal
6,chr17,exon,82045150,82047901,-,RFNG,ENSG00000169733.12,ENSG00000169733.12_1,2.0,ENSG00000169733.12_1__distal
7,chr17,transcript,82045150,82047901,-,RFNG,ENSG00000169733.12,ENSG00000169733.12_1,2.0,ENSG00000169733.12_1__distal
8,chr17,gene,82045150,82047901,-,RFNG,ENSG00000169733.12,ENSG00000169733.12_1,2.0,ENSG00000169733.12_1__distal
9,chr17,exon,82045150,82048807,-,RFNG,ENSG00000169733.12,ENSG00000169733.12_1,2.0,ENSG00000169733.12_1


In [162]:
# Extend the combined GTF with the RFNG/TLX1 regions, papa_gtf_ale_ipa_alt_m
# and papa_cryp_ale_ipa, then sort.
# NOTE(review): comb_gtf is both an input and the output here, so re-running
# this cell concatenates the previous result again.
# NOTE(review): the inputs do not all share the same columns (the displayed
# table shows empty papa_bed_name / Score cells) - TODO confirm how the
# missing Score values are written by to_gtf.
gtf_parts = [rfng_tlx1_comb_gtf, comb_gtf, papa_gtf_ale_ipa_alt_m, papa_cryp_ale_ipa]
comb_gtf = pr.concat(gtf_parts).sort()
comb_gtf

Unnamed: 0,Chromosome,Start,End,papa_bed_name,Score,Strand,gene_name,Feature,pas_number,gene_id,transcript_id,le_id
0,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,exon,2.0,ENSG00000065526.12,ENSG00000065526.12_4,ENSG00000065526.12_4
1,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,transcript,2.0,ENSG00000065526.12,ENSG00000065526.12_4,ENSG00000065526.12_4
2,chr1,15939295,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,gene,2.0,ENSG00000065526.12,ENSG00000065526.12_4,ENSG00000065526.12_4
3,chr1,15940456,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,exon,2.0,ENSG00000065526.12,ENSG00000065526.12_4__distal,ENSG00000065526.12_4
4,chr1,15940456,15940477,ENSG00000065526.12_4|SPEN|distal|cryptic,.,+,SPEN,transcript,2.0,ENSG00000065526.12,ENSG00000065526.12_4__distal,ENSG00000065526.12_4
...,...,...,...,...,...,...,...,...,...,...,...,...
1882,chrX,131688563,131692636,,,-,FIRRE,exon,4.0,ENSG00000213468.7,ENSG00000213468.7_4,ENSG00000213468.7_4
1883,chrX,131688563,131692636,,,-,FIRRE,transcript,4.0,ENSG00000213468.7,ENSG00000213468.7_4,ENSG00000213468.7_4
1884,chrX,131823775,131825221,ENSG00000213468.7_1|FIRRE|bleedthrough|cryptic,.,-,FIRRE,transcript,1.0,ENSG00000213468.7,ENSG00000213468.7_1,ENSG00000213468.7_1
1885,chrX,131823775,131825221,ENSG00000213468.7_1|FIRRE|bleedthrough|cryptic,.,-,FIRRE,exon,1.0,ENSG00000213468.7,ENSG00000213468.7_1,ENSG00000213468.7_1


In [163]:
# Sanity check: count the distinct transcript_ids per gene_id, then tabulate
# how many genes have each count (a gene with a count of 1 has only a single
# transcript_id in the combined GTF).
n_tx_per_gene = comb_gtf.as_df().groupby("gene_id")["transcript_id"].nunique()
n_tx_per_gene.value_counts()

transcript_id
2    185
4      5
1      4
Name: count, dtype: int64

In [168]:
# gene_ids that are represented by exactly one transcript_id in the combined GTF
tx_count_by_gene = comb_gtf.as_df().groupby("gene_id")["transcript_id"].nunique()
single_gene_ids = set(tx_count_by_gene.loc[lambda x: x == 1].index)

# Look those genes up in the original PAPA cryptic BED to see which events
# they correspond to. gene_id is parsed from Name in the same way as elsewhere
# in this notebook: first "|" field, then the text before the first "_"
# (literal split, regex=False).
(
    papa_cryp_bed
    .assign(
        "gene_id",
        lambda df: df.Name.str.split("\\|", expand=True)[0].str.split("_", expand=True, regex=False)[0],
    )
    .subset(lambda df: df.gene_id.isin(single_gene_ids))
)

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_id
0,chr10,87365805,87372223,ENSG00000214562.15_1|NUTM2D|proximal|cryptic,.,+,ENSG00000214562.15
1,chr10,87366138,87372223,ENSG00000214562.15_2|NUTM2D|distal|cryptic,.,+,ENSG00000214562.15
2,chr11,134502943,134505661,ENSG00000255545.8_2|B3GAT1-DT|spliced|cryptic,.,+,ENSG00000255545.8
3,chr14,31433709,31438140,"ENSG00000203546.7,ENSG00000129480.13_2|ENSG000...",.,-,"ENSG00000203546.7,ENSG00000129480.13"
4,chr17,30520021,30528331,ENSG00000108587.16_3|GOSR1|proximal|cryptic,.,+,ENSG00000108587.16
5,chr17,30522253,30528331,ENSG00000108587.16_4|GOSR1|distal|cryptic,.,+,ENSG00000108587.16


In [None]:
# NOTE(review): leftover scratch cell - it only evaluates the gene symbol
# below and has no execution count in the saved notebook. Presumably a
# reminder to inspect NUTM2D; TODO remove or replace with the intended check.
"NUTM2D"

In [169]:
# Write the combined annotation to disk as a GTF file
feature_counts_gtf_path = "processed/isoform_specific/regions/papa_feature_counts.gtf"
comb_gtf.to_gtf(feature_counts_gtf_path)