ddd

In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
from helpers import get_terminal_regions, _df_update_3p
import os


In [2]:
# BED file of proximal + distal PAS contained within genes (used to plot fig2c in Zeng et al. biorxiv)
zeng_bed = pr.read_bed("data/zeng_2024/zeng_pas_fig2c.bed")
zeng_bed

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,629997,629998,MTND2P28,.,+
1,chr1,630367,630368,MTND2P28,.,+
2,chr1,854387,854388,LINC01128,.,+
3,chr1,859444,859445,LINC01128,.,+
4,chr1,1011462,1011463,ISG15,.,+
...,...,...,...,...,...,...
17897,chrY,18932449,18932450,TTTY14,.,-
17898,chrY,19691944,19691945,,.,-
17899,chrY,19692490,19692491,,.,-
17900,chrY,19703866,19703867,KDM5D,.,-


In [3]:
# annotate pas with unique id
zeng_bed = zeng_bed.assign("pas_id", lambda df: df.Chromosome.str.cat(df[["Start", "End", "Strand", "Name"]].astype(str), sep=":"))
zeng_bed

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,pas_id
0,chr1,629997,629998,MTND2P28,.,+,chr1:629997:629998:+:MTND2P28
1,chr1,630367,630368,MTND2P28,.,+,chr1:630367:630368:+:MTND2P28
2,chr1,854387,854388,LINC01128,.,+,chr1:854387:854388:+:LINC01128
3,chr1,859444,859445,LINC01128,.,+,chr1:859444:859445:+:LINC01128
4,chr1,1011462,1011463,ISG15,.,+,chr1:1011462:1011463:+:ISG15
...,...,...,...,...,...,...,...
17897,chrY,18932449,18932450,TTTY14,.,-,chrY:18932449:18932450:-:TTTY14
17898,chrY,19691944,19691945,,.,-,chrY:19691944:19691945:-:nan
17899,chrY,19692490,19692491,,.,-,chrY:19692490:19692491:-:nan
17900,chrY,19703866,19703867,KDM5D,.,-,chrY:19703866:19703867:-:KDM5D


In [4]:
# read in Gencode GTF, extract exons
exons = pr.read_gtf("data/gencode.v40.annotation.gtf").subset(lambda df: df.Feature == "exon")
exons = exons[["Feature", "gene_id", "gene_name", "transcript_id", "exon_number", "transcript_type"]]
exons


Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,gene_name,transcript_id,exon_number,transcript_type
0,chr1,exon,11868,12227,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,1,processed_transcript
1,chr1,exon,12612,12721,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,2,processed_transcript
2,chr1,exon,13220,14409,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,3,processed_transcript
3,chr1,exon,12009,12057,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,1,transcribed_unprocessed_pseudogene
4,chr1,exon,12178,12227,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,2,transcribed_unprocessed_pseudogene
...,...,...,...,...,...,...,...,...,...,...
1573257,chrY,exon,57214349,57214397,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,1,unprocessed_pseudogene
1573258,chrY,exon,57213879,57213964,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,2,unprocessed_pseudogene
1573259,chrY,exon,57213525,57213602,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,3,unprocessed_pseudogene
1573260,chrY,exon,57213203,57213357,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,4,unprocessed_pseudogene


In [99]:
genes = exons.boundaries(group_by="gene_id")

In [103]:
# do 'NA' genes overlap with gencode annotated genes
pas_na_gene = zeng_bed.subset(lambda df: df.pas_id.str.contains("nan")).join(genes, how = "left")
pas_na_gene


Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,pas_id,Start_b,End_b,Strand_b,gene_id
0,chr1,1066354,1066355,,.,+,chr1:1066354:1066355:+:nan,1059686,1069355,+,ENSG00000217801.11
1,chr1,1066455,1066456,,.,+,chr1:1066455:1066456:+:nan,1059686,1069355,+,ENSG00000217801.11
2,chr1,1575620,1575621,,.,+,chr1:1575620:1575621:+:nan,1574101,1577075,+,ENSG00000215014.5
3,chr1,1577070,1577071,,.,+,chr1:1577070:1577071:+:nan,1574101,1577075,+,ENSG00000215014.5
4,chr1,1728507,1728508,,.,+,chr1:1728507:1728508:+:nan,1702735,1737688,-,ENSG00000268575.1
...,...,...,...,...,...,...,...,...,...,...,...
1376,chrY,12326385,12326386,,.,-,chrY:12326385:12326386:-:nan,12088009,12421587,-,ENSG00000206159.12
1377,chrY,12378537,12378538,,.,-,chrY:12378537:12378538:-:nan,12088009,12421587,-,ENSG00000206159.12
1378,chrY,12378537,12378538,,.,-,chrY:12378537:12378538:-:nan,12354095,12387328,+,ENSG00000225117.2
1379,chrY,19691944,19691945,,.,-,chrY:19691944:19691945:-:nan,19691940,19694606,-,ENSG00000260197.1


In [106]:
pas_na_gene.assign("gene_overlap", lambda df: df.Start_b.ne(-1)).gene_overlap.value_counts()

gene_overlap
True     1353
False      28
Name: count, dtype: int64

In [5]:
# subset to last exons of all annotated transcripts
exons = exons.assign("exon_number", lambda df: df.exon_number.astype(float).astype(int))
print(exons.exon_number.dtypes)
exons

int64


Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,gene_name,transcript_id,exon_number,transcript_type
0,chr1,exon,11868,12227,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,1,processed_transcript
1,chr1,exon,12612,12721,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,2,processed_transcript
2,chr1,exon,13220,14409,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,3,processed_transcript
3,chr1,exon,12009,12057,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,1,transcribed_unprocessed_pseudogene
4,chr1,exon,12178,12227,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,2,transcribed_unprocessed_pseudogene
...,...,...,...,...,...,...,...,...,...,...
1573257,chrY,exon,57214349,57214397,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,1,unprocessed_pseudogene
1573258,chrY,exon,57213879,57213964,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,2,unprocessed_pseudogene
1573259,chrY,exon,57213525,57213602,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,3,unprocessed_pseudogene
1573260,chrY,exon,57213203,57213357,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,4,unprocessed_pseudogene


In [6]:
# extract last exons for all annotated transcripts
last_exons = get_terminal_regions(exons, number_type="stranded")

# assign an 'exon id' - basically just the coordinates (so can handle identical LEs x transcripts of same gene)
last_exons = last_exons.assign("exon_id", lambda df: df.Chromosome.str.cat(df[["Start", "End", "Strand", "gene_name"]].astype(str), sep=":"))

# don't need it
last_exons = last_exons.drop("exon_number")

# annotate overlapping last exons with a common ID
last_exons = last_exons.cluster(strand=True)

last_exons

Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,gene_name,transcript_id,transcript_type,exon_id,Cluster
0,chr1,exon,13220,14409,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,processed_transcript,chr1:13220:14409:+:DDX11L1,1
1,chr1,exon,13452,13670,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,transcribed_unprocessed_pseudogene,chr1:13452:13670:+:DDX11L1,1
2,chr1,exon,30365,30503,+,ENSG00000284332.1,MIR1302-2,ENST00000607096.1,miRNA,chr1:30365:30503:+:MIR1302-2,2
3,chr1,exon,30975,31097,+,ENSG00000243485.5,MIR1302-2HG,ENST00000473358.1,lncRNA,chr1:30975:31097:+:MIR1302-2HG,3
4,chr1,exon,30975,31109,+,ENSG00000243485.5,MIR1302-2HG,ENST00000469289.1,lncRNA,chr1:30975:31109:+:MIR1302-2HG,3
...,...,...,...,...,...,...,...,...,...,...,...
246619,chrY,exon,57015104,57016096,-,ENSG00000237801.6_PAR_Y,AMD1P2,ENST00000412936.6_PAR_Y,processed_pseudogene,chrY:57015104:57016096:-:AMD1P2,119470
246620,chrY,exon,57165511,57165845,-,ENSG00000228410.6_PAR_Y,ELOCP24,ENST00000456370.6_PAR_Y,processed_pseudogene,chrY:57165511:57165845:-:ELOCP24,119471
246621,chrY,exon,57171889,57172769,-,ENSG00000223484.7_PAR_Y,TRPC6P,ENST00000421233.6_PAR_Y,processed_pseudogene,chrY:57171889:57172769:-:TRPC6P,119472
246622,chrY,exon,57201142,57202020,-,ENSG00000185203.12_PAR_Y,WASIR1,ENST00000399966.9_PAR_Y,lncRNA,chrY:57201142:57202020:-:WASIR1,119473


In [31]:
# track input PAS
pas_ids = set(zeng_bed.pas_id)
len(pas_ids)

17902

In [7]:
# Assign PAS to last exons based on overlap
pas_le = last_exons[["exon_id", "Cluster"]].join(zeng_bed[["Score", "pas_id"]], strandedness="same",)
pas_le

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Score,Strand_b,pas_id
0,chr1,629639,630683,+,chr1:629639:630683:+:MTND2P28,16,629997,629998,.,+,chr1:629997:629998:+:MTND2P28
1,chr1,629639,630683,+,chr1:629639:630683:+:MTND2P28,16,630367,630368,.,+,chr1:630367:630368:+:MTND2P28
2,chr1,853390,854398,+,chr1:853390:854398:+:LINC01128,35,854387,854388,.,+,chr1:854387:854388:+:LINC01128
3,chr1,853390,859446,+,chr1:853390:859446:+:LINC01128,35,854387,854388,.,+,chr1:854387:854388:+:LINC01128
4,chr1,853390,859446,+,chr1:853390:859446:+:LINC01128,35,859444,859445,.,+,chr1:859444:859445:+:LINC01128
...,...,...,...,...,...,...,...,...,...,...,...
38045,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19705419,19705420,.,-,chrY:19705419:19705420:-:KDM5D
38046,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703866,19703867,.,-,chrY:19703866:19703867:-:KDM5D
38047,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19705419,19705420,.,-,chrY:19705419:19705420:-:KDM5D
38048,chrY,19705416,19706345,-,chrY:19705416:19706345:-:KDM5D,119349,19705419,19705420,.,-,chrY:19705419:19705420:-:KDM5D


In [8]:
# Count the number of unique PAS overlapping last exons 
# (2 = 3'UTR PAS, 1 = ALEs)
num_olap_pas = pas_le.as_df().groupby("exon_id")["pas_id"].nunique().reset_index()

# number of putative 3'UTR APA / ALE PAS
num_olap_pas.pas_id.value_counts()

pas_id
1    13557
2     6923
3        2
Name: count, dtype: int64

1 overlap could arise due to a short annotated ALE (but have a longer annotated ALE of the same gene overlap it)

At gene level, check if any last exon has two overlapping PAS, and recalculate the frequency distribution

In [9]:
# extract gene name from exon_id
num_olap_pas.loc[:, "gene_name"] = num_olap_pas.exon_id.str.split(":", expand=True, regex=False)[4]

# for each gene, pick the max number of pas (pas_id column) and recount distribution (i.e. does gene have at least one tandem APA last exon)
num_olap_pas.groupby("gene_name")["pas_id"].max().value_counts()


pas_id
2    4877
1    3549
3       2
Name: count, dtype: int64

Tandem APA last exons are now much more frequent, as expected

To define tandem APA last exons, will need to subset for exon IDs with two overlapping PAS. Will keep last exons with distinct 5'ends (but still 2 overlapping) for now, as can't see a reasonable justification for removing/prioritising.

1. Get list/set of exon IDs with two overlapping PAS, subset pas_le for these events
2. Update the 3' coordinate of last exons (strand aware) to the reported PAS.
3. Drop duplicate intervals (arise from identical last exons/extended 3'ends with same 5'end), get BED ready (i.e. define a cleaned 'Name' field)

In [10]:
# get last exons with two overlapping PAS & subset for joined last exon + pas coordinates
tandem_exons_ids = set(num_olap_pas.loc[num_olap_pas["pas_id"] == 2, "exon_id"])

pas_le_3utr = pas_le.subset(lambda df: df.exon_id.isin(tandem_exons_ids))
pas_le_3utr

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Score,Strand_b,pas_id
0,chr1,629639,630683,+,chr1:629639:630683:+:MTND2P28,16,629997,629998,.,+,chr1:629997:629998:+:MTND2P28
1,chr1,629639,630683,+,chr1:629639:630683:+:MTND2P28,16,630367,630368,.,+,chr1:630367:630368:+:MTND2P28
2,chr1,853390,859446,+,chr1:853390:859446:+:LINC01128,35,854387,854388,.,+,chr1:854387:854388:+:LINC01128
3,chr1,853390,859446,+,chr1:853390:859446:+:LINC01128,35,859444,859445,.,+,chr1:859444:859445:+:LINC01128
4,chr1,1065829,1066459,+,chr1:1065829:1066459:+:ENSG00000217801,57,1066354,1066355,.,+,chr1:1066354:1066355:+:nan
...,...,...,...,...,...,...,...,...,...,...,...
18689,chrY,19691940,19694606,-,chrY:19691940:19694606:-:ENSG00000260197,119348,19692490,19692491,.,-,chrY:19692490:19692491:-:nan
18690,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703866,19703867,.,-,chrY:19703866:19703867:-:KDM5D
18691,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19705419,19705420,.,-,chrY:19705419:19705420:-:KDM5D
18692,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703866,19703867,.,-,chrY:19703866:19703867:-:KDM5D


In [11]:
# update 3' exon coordinates to the PAS coordinate
pas_le_3utr_upd = pas_le_3utr.apply(lambda df: _df_update_3p(df))
pas_le_3utr_upd

Number of negative or zero-length updated intervals to be dropped - 940
Number of negative or zero-length updated intervals to be dropped - 1030
Number of negative or zero-length updated intervals to be dropped - 728
Number of negative or zero-length updated intervals to be dropped - 612
Number of negative or zero-length updated intervals to be dropped - 552
Number of negative or zero-length updated intervals to be dropped - 584
Number of negative or zero-length updated intervals to be dropped - 384
Number of negative or zero-length updated intervals to be dropped - 442
Number of negative or zero-length updated intervals to be dropped - 452
Number of negative or zero-length updated intervals to be dropped - 514
Number of negative or zero-length updated intervals to be dropped - 408
Number of negative or zero-length updated intervals to be dropped - 600
Number of negative or zero-length updated intervals to be dropped - 408
Number of negative or zero-length updated intervals to be dropp

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Score,Strand_b,pas_id
0,chr1,629639,629998,+,chr1:629639:630683:+:MTND2P28,16,629997,630683,.,+,chr1:629997:629998:+:MTND2P28
1,chr1,629639,630368,+,chr1:629639:630683:+:MTND2P28,16,630367,630683,.,+,chr1:630367:630368:+:MTND2P28
2,chr1,853390,854388,+,chr1:853390:859446:+:LINC01128,35,854387,859446,.,+,chr1:854387:854388:+:LINC01128
3,chr1,853390,859445,+,chr1:853390:859446:+:LINC01128,35,859444,859446,.,+,chr1:859444:859445:+:LINC01128
4,chr1,1065829,1066355,+,chr1:1065829:1066459:+:ENSG00000217801,57,1066354,1066459,.,+,chr1:1066354:1066355:+:nan
...,...,...,...,...,...,...,...,...,...,...,...
18689,chrY,19692490,19694606,-,chrY:19691940:19694606:-:ENSG00000260197,119348,19691940,19692491,.,-,chrY:19692490:19692491:-:nan
18690,chrY,19703866,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703864,19703867,.,-,chrY:19703866:19703867:-:KDM5D
18691,chrY,19705419,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703864,19705420,.,-,chrY:19705419:19705420:-:KDM5D
18692,chrY,19703866,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703864,19703867,.,-,chrY:19703866:19703867:-:KDM5D


In [12]:
# clean up

# drop duplicates
pas_le_3utr_upd = pas_le_3utr_upd.drop_duplicate_positions(strand=True)

# remake pas_id, using gene name from exon_id if required
pas_le_3utr_upd = pas_le_3utr_upd.as_df()

# generate standardised gene name column, preferring name from exon_id where provided PAS has no gene name
pas_le_3utr_upd.loc[:, "pas_gn"] = pas_le_3utr_upd.pas_id.str.split(":", expand=True, regex=False)[4]
pas_le_3utr_upd.loc[:, "exon_gn"] = pas_le_3utr_upd.exon_id.str.split(":", expand=True, regex=False)[4]
pas_le_3utr_upd.loc[:, "gene_name"] = np.where(pas_le_3utr_upd.pas_gn == "nan", pas_le_3utr_upd["exon_gn"], pas_le_3utr_upd["pas_gn"])

# Update pas_id to include the standardised gene name
pas_le_3utr_upd.loc[:, "pas_id"] = pas_le_3utr_upd.pas_id.str.split(':').str[:4].str.join(':')
pas_le_3utr_upd.loc[:, "pas_id"] = pas_le_3utr_upd.pas_id.str.cat(pas_le_3utr_upd.gene_name, sep=":")
pas_le_3utr_upd.rename(columns={"pas_id": "Name"}, inplace=True)

pas_le_3utr_upd = pr.PyRanges(pas_le_3utr_upd)

pas_le_3utr_upd[["Score", "Name"]]

Unnamed: 0,Chromosome,Start,End,Strand,Score,Name
0,chr1,629639,629998,+,.,chr1:629997:629998:+:MTND2P28
1,chr1,629639,630368,+,.,chr1:630367:630368:+:MTND2P28
2,chr1,853390,854388,+,.,chr1:854387:854388:+:LINC01128
3,chr1,853390,859445,+,.,chr1:859444:859445:+:LINC01128
4,chr1,1065829,1066355,+,.,chr1:1066354:1066355:+:ENSG00000217801
...,...,...,...,...,...,...
10373,chrY,2934524,2934771,-,.,chrY:2934524:2934525:-:ENSG00000278847
10374,chrY,19691944,19694606,-,.,chrY:19691944:19691945:-:ENSG00000260197
10375,chrY,19692490,19694606,-,.,chrY:19692490:19692491:-:ENSG00000260197
10376,chrY,19703866,19706345,-,.,chrY:19703866:19703867:-:KDM5D


# ALEs...

Need to identify genes where have only 1 PAS overlapping each ALE.
Then using those genes, check that the last exons themselves are not overlapping, and that the PAS are distinct (may only be one / 2 pas for gene overlapping an annotated exon - need to track those)

In [13]:
# for each gene, get max overlapping PAS count of any of its last exons
max_gene_olap_pas =  num_olap_pas.groupby("gene_name")["pas_id"].max()

# subset to genes with overlap PAS count of 1, becasue these do not have any tandem APA
put_ale_genes = set(max_gene_olap_pas[max_gene_olap_pas.eq(1)].index)
# get exon ids
put_ale_ids = set(num_olap_pas.loc[num_olap_pas.gene_name.isin(put_ale_genes), "exon_id"])

pas_le_ale = pas_le.subset(lambda df: df.exon_id.isin(put_ale_ids))
pas_le_ale

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Score,Strand_b,pas_id
0,chr1,1013983,1014540,+,chr1:1013983:1014540:+:ISG15,51,1014537,1014538,.,+,chr1:1014537:1014538:+:ISG15
1,chr1,1013983,1014540,+,chr1:1013983:1014540:+:ISG15,51,1014537,1014538,.,+,chr1:1014537:1014538:+:ISG15
2,chr1,1616507,1617898,+,chr1:1616507:1617898:+:MIB2,100,1617320,1617321,.,+,chr1:1617320:1617321:+:MIB2
3,chr1,1616507,1619210,+,chr1:1616507:1619210:+:MIB2,100,1617320,1617321,.,+,chr1:1617320:1617321:+:MIB2
4,chr1,1617077,1617323,+,chr1:1617077:1617323:+:MIB2,100,1617320,1617321,.,+,chr1:1617320:1617321:+:MIB2
...,...,...,...,...,...,...,...,...,...,...,...
12049,chrY,18930470,18932841,-,chrY:18930470:18932841:-:TTTY14,119340,18932449,18932450,.,-,chrY:18932449:18932450:-:TTTY14
12050,chrY,18932006,18932841,-,chrY:18932006:18932841:-:TTTY14,119340,18932449,18932450,.,-,chrY:18932449:18932450:-:TTTY14
12051,chrY,18932316,18932841,-,chrY:18932316:18932841:-:TTTY14,119340,18932449,18932450,.,-,chrY:18932449:18932450:-:TTTY14
12052,chrY,18932435,18932841,-,chrY:18932435:18932841:-:TTTY14,119340,18932449,18932450,.,-,chrY:18932449:18932450:-:TTTY14


In [14]:
# ALEs should be non-overlapping. Each gene should therefore have two unique PAS, and two unique Cluster values
pas_le_ale = pas_le_ale.assign("gene_name", lambda df: df.exon_id.str.split(":", expand=True)[4])
# count unqiue pas + non-overlapping ales
ale_uniq_counts = pas_le_ale.as_df().groupby("gene_name")[["Cluster", "pas_id"]].nunique().reset_index()

# quick check counts
ale_uniq_counts[["Cluster","pas_id"]].value_counts()

Cluster  pas_id
1        1         2621
2        2          900
1        2           27
3        3            1
Name: count, dtype: int64

Would anticipate 2 + 2 being the bonafide ALEs.
1 + 1 likely mean 2nd PAS doesn't overlap with known last exon boundaries (could be intronic PAS, or just downstream of annotated ALEs?). Would have to better parse

In [15]:
# check the 1 ALE but two PAS events
pas_le_ale.subset(lambda df: df.gene_name.isin(set(ale_uniq_counts.loc[(ale_uniq_counts["Cluster"] == 1) & (ale_uniq_counts["pas_id"] == 2), "gene_name"]))).head(22)

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Score,Strand_b,pas_id,gene_name
0,chr1,42674847,42676446,+,chr1:42674847:42676446:+:PPIH,1320,42676040,42676041,.,+,chr1:42676040:42676041:+:PPIH,PPIH
1,chr1,42675287,42676713,+,chr1:42675287:42676713:+:PPIH,1320,42676040,42676041,.,+,chr1:42676040:42676041:+:PPIH,PPIH
2,chr1,42675366,42676043,+,chr1:42675366:42676043:+:PPIH,1320,42676040,42676041,.,+,chr1:42676040:42676041:+:PPIH,PPIH
3,chr1,42675366,42676619,+,chr1:42675366:42676619:+:PPIH,1320,42676040,42676041,.,+,chr1:42676040:42676041:+:PPIH,PPIH
4,chr1,42675732,42676202,+,chr1:42675732:42676202:+:PPIH,1320,42676040,42676041,.,+,chr1:42676040:42676041:+:PPIH,PPIH
5,chr1,42675737,42676043,+,chr1:42675737:42676043:+:PPIH,1320,42676040,42676041,.,+,chr1:42676040:42676041:+:PPIH,PPIH
6,chr1,42676583,42676758,+,chr1:42676583:42676758:+:PPIH,1320,42676757,42676758,.,+,chr1:42676757:42676758:+:PPIH,PPIH
7,chr1,42676583,42676758,+,chr1:42676583:42676758:+:PPIH,1320,42676757,42676758,.,+,chr1:42676757:42676758:+:PPIH,PPIH
8,chr1,42676583,42676758,+,chr1:42676583:42676758:+:PPIH,1320,42676757,42676758,.,+,chr1:42676757:42676758:+:PPIH,PPIH
9,chr1,44777999,44778663,+,chr1:44777999:44778663:+:RPS8,1468,44778053,44778054,.,+,chr1:44778053:44778054:+:RPS8,RPS8


Partly overlapping ALEs with different 5'end coordinates. Probably better described as tandem ALEs, just need to merge the exons before updating the PAS?



In [16]:
# is SFPQ represented in putative ALEs?
ale_uniq_counts.loc[(ale_uniq_counts["Cluster"] == 2) & (ale_uniq_counts["pas_id"] == 2) & (ale_uniq_counts["gene_name"] == "SFPQ"),:]

Unnamed: 0,gene_name,Cluster,pas_id
2681,SFPQ,2,2


In [17]:
# get last exons with two overlapping PAS & subset for joined last exon + pas coordinates
# get IDs for 'bonafide' ALEs
ale_gene_names = ale_uniq_counts.loc[(ale_uniq_counts["Cluster"] == 2) & (ale_uniq_counts["pas_id"] == 2), "gene_name"]
pas_le_ale_bf = pas_le_ale.subset(lambda df: df.gene_name.isin(ale_gene_names))
pas_le_ale_bf


Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Score,Strand_b,pas_id,gene_name
0,chr1,1616507,1617898,+,chr1:1616507:1617898:+:MIB2,100,1617320,1617321,.,+,chr1:1617320:1617321:+:MIB2,MIB2
1,chr1,1616507,1619210,+,chr1:1616507:1619210:+:MIB2,100,1617320,1617321,.,+,chr1:1617320:1617321:+:MIB2,MIB2
2,chr1,1617077,1617323,+,chr1:1617077:1617323:+:MIB2,100,1617320,1617321,.,+,chr1:1617320:1617321:+:MIB2,MIB2
3,chr1,1630291,1630605,+,chr1:1630291:1630605:+:MIB2,109,1630604,1630605,.,+,chr1:1630604:1630605:+:MIB2,MIB2
4,chr1,1630291,1630605,+,chr1:1630291:1630605:+:MIB2,109,1630604,1630605,.,+,chr1:1630604:1630605:+:MIB2,MIB2
...,...,...,...,...,...,...,...,...,...,...,...,...
4523,chrY,18930470,18932841,-,chrY:18930470:18932841:-:TTTY14,119340,18932449,18932450,.,-,chrY:18932449:18932450:-:TTTY14,TTTY14
4524,chrY,18932006,18932841,-,chrY:18932006:18932841:-:TTTY14,119340,18932449,18932450,.,-,chrY:18932449:18932450:-:TTTY14,TTTY14
4525,chrY,18932316,18932841,-,chrY:18932316:18932841:-:TTTY14,119340,18932449,18932450,.,-,chrY:18932449:18932450:-:TTTY14,TTTY14
4526,chrY,18932435,18932841,-,chrY:18932435:18932841:-:TTTY14,119340,18932449,18932450,.,-,chrY:18932449:18932450:-:TTTY14,TTTY14


In [18]:
# update 3' exon coordinates to the PAS coordinate
pas_le_ale_bf_upd = pas_le_ale_bf.apply(lambda df: _df_update_3p(df))
pas_le_ale_bf_upd = pas_le_ale_bf_upd.drop_duplicate_positions(strand=True)
pas_le_ale_bf_upd

Number of negative or zero-length updated intervals to be dropped - 177
Number of negative or zero-length updated intervals to be dropped - 217
Number of negative or zero-length updated intervals to be dropped - 237
Number of negative or zero-length updated intervals to be dropped - 99
Number of negative or zero-length updated intervals to be dropped - 125
Number of negative or zero-length updated intervals to be dropped - 82
Number of negative or zero-length updated intervals to be dropped - 66
Number of negative or zero-length updated intervals to be dropped - 107
Number of negative or zero-length updated intervals to be dropped - 83
Number of negative or zero-length updated intervals to be dropped - 48
Number of negative or zero-length updated intervals to be dropped - 104
Number of negative or zero-length updated intervals to be dropped - 136
Number of negative or zero-length updated intervals to be dropped - 119
Number of negative or zero-length updated intervals to be dropped - 1

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Score,Strand_b,pas_id,gene_name
0,chr1,1616507,1617321,+,chr1:1616507:1617898:+:MIB2,100,1617320,1617898,.,+,chr1:1617320:1617321:+:MIB2,MIB2
1,chr1,1617077,1617321,+,chr1:1617077:1617323:+:MIB2,100,1617320,1617323,.,+,chr1:1617320:1617321:+:MIB2,MIB2
2,chr1,1630291,1630605,+,chr1:1630291:1630605:+:MIB2,109,1630604,1630605,.,+,chr1:1630604:1630605:+:MIB2,MIB2
3,chr1,6632795,6633562,+,chr1:6632795:6633562:+:THAP3,248,6633561,6633562,.,+,chr1:6633561:6633562:+:THAP3,THAP3
4,chr1,6634019,6635586,+,chr1:6634019:6635586:+:THAP3,249,6635585,6635586,.,+,chr1:6635585:6635586:+:THAP3,THAP3
...,...,...,...,...,...,...,...,...,...,...,...,...
2046,chrY,19593087,19594158,+,chrY:19593087:19594161:+:TXLNGY,118984,19594157,19594161,.,+,chrY:19594157:19594158:+:nan,TXLNGY
2047,chrY,13234577,13234826,-,chrY:13234576:13234826:-:UTY,119277,13234576,13234578,.,-,chrY:13234577:13234578:-:UTY,UTY
2048,chrY,13248385,13249882,-,chrY:13248378:13249882:-:UTY,119278,13248378,13248386,.,-,chrY:13248385:13248386:-:UTY,UTY
2049,chrY,18872593,18872834,-,chrY:18872500:18872834:-:TTTY14,119337,18872500,18872594,.,-,chrY:18872593:18872594:-:TTTY14,TTTY14


In [19]:
# remake pas_id, using gene name from exon_id if required
pas_le_ale_bf_upd = pas_le_ale_bf_upd.drop("gene_name").as_df()

# generate standardised gene name column, preferring name from exon_id where provided PAS has no gene name
pas_le_ale_bf_upd.loc[:, "pas_gn"] = pas_le_ale_bf_upd.pas_id.str.split(":", expand=True, regex=False)[4]
pas_le_ale_bf_upd.loc[:, "exon_gn"] = pas_le_ale_bf_upd.exon_id.str.split(":", expand=True, regex=False)[4]
pas_le_ale_bf_upd.loc[:, "gene_name"] = np.where(pas_le_ale_bf_upd.pas_gn == "nan", pas_le_ale_bf_upd["exon_gn"], pas_le_ale_bf_upd["pas_gn"])

# Update pas_id to include the standardised gene name
pas_le_ale_bf_upd.loc[:, "pas_id"] = pas_le_ale_bf_upd.pas_id.str.split(':').str[:4].str.join(':')
pas_le_ale_bf_upd.loc[:, "pas_id"] = pas_le_ale_bf_upd.pas_id.str.cat(pas_le_ale_bf_upd.gene_name, sep=":")
pas_le_ale_bf_upd.rename(columns={"pas_id": "Name"}, inplace=True)

pas_le_ale_bf_upd = pr.PyRanges(pas_le_ale_bf_upd)

pas_le_ale_bf_upd[["Score", "Name"]]

Unnamed: 0,Chromosome,Start,End,Strand,Score,Name
0,chr1,1616507,1617321,+,.,chr1:1617320:1617321:+:MIB2
1,chr1,1617077,1617321,+,.,chr1:1617320:1617321:+:MIB2
2,chr1,1630291,1630605,+,.,chr1:1630604:1630605:+:MIB2
3,chr1,6632795,6633562,+,.,chr1:6633561:6633562:+:THAP3
4,chr1,6634019,6635586,+,.,chr1:6635585:6635586:+:THAP3
...,...,...,...,...,...,...
2046,chrY,19593087,19594158,+,.,chrY:19594157:19594158:+:TXLNGY
2047,chrY,13234577,13234826,-,.,chrY:13234577:13234578:-:UTY
2048,chrY,13248385,13249882,-,.,chrY:13248385:13248386:-:UTY
2049,chrY,18872593,18872834,-,.,chrY:18872593:18872594:-:TTTY14


In [20]:
pas_le_ale_bf_upd[["Score", "Name"]].subset(lambda df: df.Name.str.contains("SFPQ", regex=False))

Unnamed: 0,Chromosome,Start,End,Strand,Score,Name
0,chr1,35176380,35176474,-,.,chr1:35176380:35176381:-:SFPQ
1,chr1,35176380,35176471,-,.,chr1:35176380:35176381:-:SFPQ
2,chr1,35183601,35184593,-,.,chr1:35183601:35183602:-:SFPQ


In [21]:
# How many of Zeng et al. targets are missing?
with open("data/zeng_2024/zeng_target_genes.txt", "r") as infile:
    zeng_target_genes = set([line.rstrip("\n").replace("'","") for line in infile])

print(zeng_target_genes)

{'TMED10', 'GOLGA7B', 'UBE2R2', 'SLC6A8', 'SLC24A3', 'PAK4', 'TTBK1', 'SMC1A', 'ELP3', 'JPT1', 'TFDP2', 'TMEM106B', 'F11R', 'BRD9', 'MARK3', 'EIF4E2', 'KLHL42', 'ARHGAP32', 'SLC19A1', 'NDRG4', 'EMC10', 'FOXK2', 'SSR1', 'CCNL2', 'SLIT3', 'STMN2', 'NEFL', 'GSTO2', 'DPYSL5', 'ELAVL4', 'ZNF462', 'NUFIP2', 'TLX1', 'GPR173', 'SEC14L1', 'DYNC1LI1', 'MDGA1', 'ZMAT2', 'TARDBP', 'SMARCA4', 'PGRMC2', 'RAB11A', 'AZIN1', 'AGPAT4', 'CNPY3', 'SREK1', 'CSNK2A1', 'RFNG', 'G3BP1', 'HNRNPC', 'SSU72', 'LRRC3', 'EGFR', 'RPN1', 'ARNT', 'H3-3B', 'CORO1C', 'TAFA5', 'NUCKS1', 'SYP', 'PTPN9', 'ELP6', 'NDUFA9', 'NAV1', 'PARD6G', 'NSMAF', 'ELP1', 'OLA1', 'SIX3', 'CHRNB4', 'LSM14A', 'KPNA4', 'TEF', 'SFPQ', 'PPM1A', 'SMG7', 'DYRK2', 'NFE2L1', 'GREB1', 'ABCC5', 'HIF1AN', 'TFDP1', 'DIDO1', 'CADM1', 'CLSTN3', 'GGA2', 'SERF2', 'ARMC10', 'UBE2H'}


In [22]:
# get events where have defined regions
defined_genes = set(pas_le_3utr_upd.gene_name).union(set(pas_le_ale_bf_upd.gene_name))
missing_genes = zeng_target_genes.difference(defined_genes)
missing_genes

{'ARHGAP32',
 'BRD9',
 'CHRNB4',
 'CLSTN3',
 'CNPY3',
 'EGFR',
 'ELP6',
 'GSTO2',
 'HNRNPC',
 'LSM14A',
 'NAV1',
 'NSMAF',
 'PAK4',
 'PARD6G',
 'RFNG',
 'SEC14L1',
 'SIX3',
 'SLC24A3',
 'SLIT3',
 'SMG7',
 'SREK1',
 'STMN2',
 'TAFA5',
 'TLX1',
 'TTBK1',
 'UBE2R2'}

Missing - class of 3'UTR extensions - TLX1, RFNG, SIX3, ELP6 (?). Others are cryptic ALEs/IPAs that I have in my hits (STMN2, CNPY3, ARHGAP32). Would wager remainin are intronic / 3'UTR extensions I miss (check in IGV)


In [27]:
zeng_ext = pr.read_bed("data/zeng_2024/supplementary_s5.cryptic_pas.3UTR_extension.bed")
zeng_intron = pr.read_bed("data/zeng_2024/supplementary_s5.cryptic_pas.intron.bed")
zeng_ext = zeng_ext.assign("gene_name", lambda df: df.Name.str.split("|", expand=True)[2])
zeng_intron = zeng_intron.assign("gene_name", lambda df: df.Name.str.split("|", expand=True)[2])
zeng_ext_gn = set(zeng_ext.gene_name)
zeng_intron_gn = set(zeng_intron.gene_name)
zeng_ext

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name
0,chr1,39007689,39007690,chr1:39007689:+|3'UTR extension|AKIRIN1|0.0461...,.,+,AKIRIN1
1,chr1,39766107,39766108,chr1:39766107:+|3'UTR extension|PPIE|0.04583|0...,.,+,PPIE
2,chr1,52349654,52349655,chr1:52349654:+|3'UTR extension|ZFYVE9|0.02687...,.,+,ZFYVE9
3,chr1,110203695,110203696,chr1:110203695:+|3'UTR extension|SLC6A17|0.008...,.,+,SLC6A17
4,chr1,150353337,150353338,chr1:150353337:+|3'UTR extension|PRPF3|0.01155...,.,+,PRPF3
...,...,...,...,...,...,...,...
157,chrX,47217646,47217647,chrX:47217646:+|3'UTR extension|UBA1|0.01779|0...,.,+,UBA1
158,chrX,77909737,77909738,chrX:77909737:+|3'UTR extension|COX7B|0.04784|...,.,+,COX7B
159,chrX,47631616,47631617,chrX:47631616:-|3'UTR extension|ELK1|0.01899|0...,.,-,ELK1
160,chrX,54441090,54441091,chrX:54441090:-|3'UTR extension|FGD1|0.05151|0...,.,-,FGD1


In [26]:
# 3'exts
missing_genes.intersection(zeng_ext_gn)

{'RFNG', 'SIX3'}

In [28]:
# intronic PAS
missing_genes.intersection(zeng_intron_gn)

{'EGFR', 'GSTO2', 'STMN2'}

In [30]:
# missing from provided PAS
missing_genes.difference(set(zeng_bed.Name))

{'RFNG', 'SIX3', 'TLX1'}

So basically pretty much all provided, just need to rescue from current approach...
1. How many are just downstream of known last exons? (regardless of intron vs 3'UTR)
2. How many don't have a known ALE?

first, need to just know how many of these missing PAS don't overlap known last exons. Some genes will have at least 1 PAS overlapping (2621 genes based on ALE analysis),

In [40]:
# get pas IDs that haven't formed part of tandem 3'UTR-APA / bonafide ALEs
missing_pas_ids = pas_ids.difference(set(pas_le_3utr_upd.Name).union(set(pas_le_ale_bf_upd.Name)))
print(len(missing_pas_ids))

# subset to missing PAS
zeng_bed_missing = zeng_bed.subset(lambda df: df.pas_id.isin(missing_pas_ids))
zeng_bed_missing

6694


Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,pas_id
0,chr1,1011462,1011463,ISG15,.,+,chr1:1011462:1011463:+:ISG15
1,chr1,1014537,1014538,ISG15,.,+,chr1:1014537:1014538:+:ISG15
2,chr1,1066354,1066355,,.,+,chr1:1066354:1066355:+:nan
3,chr1,1066455,1066456,,.,+,chr1:1066455:1066456:+:nan
4,chr1,1575620,1575621,,.,+,chr1:1575620:1575621:+:nan
...,...,...,...,...,...,...,...
6689,chrY,2934524,2934525,,.,-,chrY:2934524:2934525:-:nan
6690,chrY,12326385,12326386,,.,-,chrY:12326385:12326386:-:nan
6691,chrY,12378537,12378538,,.,-,chrY:12378537:12378538:-:nan
6692,chrY,19691944,19691945,,.,-,chrY:19691944:19691945:-:nan


In [51]:
# calculate distance to nearest (upstream) last exon (allowing overlaps)
missing_pas_le_nr = zeng_bed_missing.nearest(last_exons, strandedness="same", overlap=True,how="upstream",)
print(missing_pas_le_nr.Distance.describe(percentiles=[i * 0.1 for i in range(0,12,2)]))
missing_pas_le_nr

count    6.694000e+03
mean     1.525460e+04
std      5.585132e+04
min      0.000000e+00
0%       0.000000e+00
20%      0.000000e+00
40%      0.000000e+00
50%      4.000000e+00
60%      1.108600e+03
80%      9.649800e+03
100%     1.123544e+06
max      1.123544e+06
Name: Distance, dtype: float64


Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,pas_id,Feature,Start_b,End_b,Strand_b,gene_id,gene_name,transcript_id,transcript_type,exon_id,Cluster,Distance
0,chr1,1014537,1014538,ISG15,.,+,chr1:1014537:1014538:+:ISG15,exon,1013983,1014540,+,ENSG00000187608.10,ISG15,ENST00000624697.4,protein_coding,chr1:1013983:1014540:+:ISG15,51,0
1,chr1,1066354,1066355,,.,+,chr1:1066354:1066355:+:nan,exon,1065829,1066459,+,ENSG00000217801.11,ENSG00000217801,ENST00000394517.7,processed_transcript,chr1:1065829:1066459:+:ENSG00000217801,57,0
2,chr1,1066455,1066456,,.,+,chr1:1066455:1066456:+:nan,exon,1065829,1066459,+,ENSG00000217801.11,ENSG00000217801,ENST00000394517.7,processed_transcript,chr1:1065829:1066459:+:ENSG00000217801,57,0
3,chr1,1575620,1575621,,.,+,chr1:1575620:1575621:+:nan,exon,1574974,1577075,+,ENSG00000215014.5,ENSG00000215014,ENST00000366221.3,retained_intron,chr1:1574974:1577075:+:ENSG00000215014,98,0
4,chr1,1577070,1577071,,.,+,chr1:1577070:1577071:+:nan,exon,1574974,1577075,+,ENSG00000215014.5,ENSG00000215014,ENST00000366221.3,retained_intron,chr1:1574974:1577075:+:ENSG00000215014,98,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6689,chrY,2934524,2934525,,.,-,chrY:2934524:2934525:-:nan,exon,2934405,2934771,-,ENSG00000278847.1,ENSG00000278847,ENST00000611750.1,lncRNA,chrY:2934405:2934771:-:ENSG00000278847,119157,0
6690,chrY,12378537,12378538,,.,-,chrY:12378537:12378538:-:nan,exon,12378531,12378749,-,ENSG00000206159.12,GYG2P1,ENST00000382965.3,transcribed_unprocessed_pseudogene,chrY:12378531:12378749:-:GYG2P1,119266,0
6691,chrY,19691944,19691945,,.,-,chrY:19691944:19691945:-:nan,exon,19691940,19694606,-,ENSG00000260197.1,ENSG00000260197,ENST00000566193.1,lncRNA,chrY:19691940:19694606:-:ENSG00000260197,119348,0
6692,chrY,19692490,19692491,,.,-,chrY:19692490:19692491:-:nan,exon,19691940,19694606,-,ENSG00000260197.1,ENSG00000260197,ENST00000566193.1,lncRNA,chrY:19691940:19694606:-:ENSG00000260197,119348,0


In [58]:
# for each gene, report the min and max distance (i.e. how many genes are just missing annotated last exon for 1 PAS?)
dist_min_max = missing_pas_le_nr.as_df().groupby("gene_name")["Distance"].agg([min, max]).reset_index().sort_values(by=["min", "max"])
# gene counts for both overlap last exons (True, True), 1 overlaps last exon (True, False) and neither overlaps (False, False)
dist_min_max.drop(columns="gene_name").eq(0,axis="columns").value_counts()

min    max  
True   False    1713
False  False    1507
True   True     1055
Name: count, dtype: int64

1055 should've been captured by previous categories...

In [72]:
# assign genes to categories based on overlaps
dist_min_max.loc[:, "overlap_group"] = dist_min_max.eq(0,axis="columns").sum(axis="columns")
dist_min_max

Unnamed: 0,gene_name,min,max,overlap_group
3,AADAT,0,0,2
9,ABCA10,0,0,2
11,ABCA3,0,0,2
25,ACBD5,0,0,2
26,ACD,0,0,2
...,...,...,...,...
2174,LINC01435,731685,731685,1
1642,FECHP1,769519,769519,1
3190,RNU6-974P,799441,799441,1
3310,SCP2D1,867610,867610,1


In [81]:
# genes with 1 missing PAS - how far away are the non-distal PAS?
dist_min_max_1 = dist_min_max[dist_min_max["overlap_group"].eq(1)]
dist_min_max_1["max"].describe(percentiles=[i * 0.1 for i in range(0,12,2)])


count    3.220000e+03
mean     2.843969e+04
std      7.187899e+04
min      1.000000e+00
0%       1.000000e+00
20%      4.440000e+02
40%      3.001800e+03
50%      5.400500e+03
60%      8.860200e+03
80%      3.098040e+04
100%     1.123544e+06
max      1.123544e+06
Name: max, dtype: float64

In [85]:
# try different max distance thresholds (how many genes rescue?)
{str(cutoff): len(dist_min_max_1[dist_min_max_1["max"] <= cutoff]) for cutoff in [0,25,50,100,200,500,1000, 2500, 5000]}

{'0': 0,
 '25': 420,
 '50': 462,
 '100': 506,
 '200': 555,
 '500': 667,
 '1000': 817,
 '2500': 1183,
 '5000': 1555}

Suspect most of the provided ones here are 3'UTR extensions (1555 / 1713). Remaining possibly unannotated ALEs

How are Yi's missing target genes distributed among the overlap categories


In [89]:
print(dist_min_max[dist_min_max.gene_name.isin(missing_genes)]["overlap_group"].value_counts())
dist_min_max[dist_min_max.gene_name.isin(missing_genes)]

overlap_group
1    13
2     9
Name: count, dtype: int64


Unnamed: 0,gene_name,min,max,overlap_group
549,CHRNB4,0,0,2
853,EGFR,0,0,2
1809,GSTO2,0,0,2
2645,NSMAF,0,0,2
2718,PAK4,0,0,2
3386,SLC24A3,0,0,2
3435,SLIT3,0,0,2
3569,STMN2,0,0,2
3627,TAFA5,0,0,2
2541,NAV1,0,1,1


In [93]:
missing_pas_le_nr.subset(lambda df: df.gene_name.isin(["STMN2", "ELP6", "FECHP1", "NAV1"]))

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,pas_id,Feature,Start_b,End_b,Strand_b,gene_id,gene_name,transcript_id,transcript_type,exon_id,Cluster,Distance
0,chr1,201823342,201823343,NAV1,.,+,chr1:201823342:201823343:+:NAV1,exon,201819836,201826969,+,ENSG00000134369.16,NAV1,ENST00000367295.5,protein_coding,chr1:201819836:201826969:+:NAV1,4359,0
1,chr1,201826969,201826970,NAV1,.,+,chr1:201826969:201826970:+:NAV1,exon,201819836,201826969,+,ENSG00000134369.16,NAV1,ENST00000685211.1,protein_coding,chr1:201819836:201826969:+:NAV1,4359,1
2,chr3,35643372,35643373,ARPP21,.,+,chr3:35643372:35643373:+:ARPP21,exon,34872640,34873854,+,ENSG00000235534.1,FECHP1,ENST00000431896.1,processed_pseudogene,chr3:34872640:34873854:+:FECHP1,19881,769519
3,chr3,47495640,47495641,ELP6,.,-,chr3:47495640:47495641:-:ELP6,exon,47495639,47496197,-,ENSG00000163832.16,ELP6,ENST00000296149.9,protein_coding,chr3:47495639:47496197:-:ELP6,23737,0
4,chr3,47500537,47500538,ELP6,.,-,chr3:47500537:47500538:-:ELP6,exon,47501526,47501851,-,ENSG00000163832.16,ELP6,ENST00000461208.5,retained_intron,chr3:47501526:47501851:-:ELP6,23739,989
5,chr8,79666156,79666157,STMN2,.,+,chr8:79666156:79666157:+:STMN2,exon,79664814,79666158,+,ENSG00000104435.14,STMN2,ENST00000220876.12,protein_coding,chr8:79664814:79666158:+:STMN2,50603,0


In [96]:
zeng_bed

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,pas_id
0,chr1,629997,629998,MTND2P28,.,+,chr1:629997:629998:+:MTND2P28
1,chr1,630367,630368,MTND2P28,.,+,chr1:630367:630368:+:MTND2P28
2,chr1,854387,854388,LINC01128,.,+,chr1:854387:854388:+:LINC01128
3,chr1,859444,859445,LINC01128,.,+,chr1:859444:859445:+:LINC01128
4,chr1,1011462,1011463,ISG15,.,+,chr1:1011462:1011463:+:ISG15
...,...,...,...,...,...,...,...
17897,chrY,18932449,18932450,TTTY14,.,-,chrY:18932449:18932450:-:TTTY14
17898,chrY,19691944,19691945,,.,-,chrY:19691944:19691945:-:nan
17899,chrY,19692490,19692491,,.,-,chrY:19692490:19692491:-:nan
17900,chrY,19703866,19703867,KDM5D,.,-,chrY:19703866:19703867:-:KDM5D


In [97]:
zeng_bed.subset(lambda df: df.Name == "STMN2")

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,pas_id
0,chr8,79617048,79617049,STMN2,.,+,chr8:79617048:79617049:+:STMN2
1,chr8,79666156,79666157,STMN2,.,+,chr8:79666156:79666157:+:STMN2


In [78]:
# output BED 
le_3utr_bed = pas_le_3utr_upd[["Score", "Name"]].sort()
le_ale_bed = pas_le_ale_bf_upd[["Score", "Name"]].sort()
le_3utr_bed.to_bed("processed/zeng_2024_3utr_pas_last_exons.bed")
le_ale_bed.to_bed("processed/zeng_2024_ale_pas_last_exons.bed")
pr.concat([le_3utr_bed, le_ale_bed]).sort().to_bed("processed/zeng_2024_combined_pas_last_exons.bed")