In [1]:
import pyranges as pr
import pandas as pd
import numpy as np
from helpers import get_terminal_regions, _df_update_3p
import os


In [2]:
# BED file of proximal + distal PAS contained within genes (used to plot fig2c in Zeng et al. biorxiv)
zeng_bed = pr.read_bed("data/zeng_2024/zeng_pas_fig2c.bed")
zeng_bed

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,629997,629998,MTND2P28,.,+
1,chr1,630367,630368,MTND2P28,.,+
2,chr1,854387,854388,LINC01128,.,+
3,chr1,859444,859445,LINC01128,.,+
4,chr1,1011462,1011463,ISG15,.,+
...,...,...,...,...,...,...
17897,chrY,18932449,18932450,TTTY14,.,-
17898,chrY,19691944,19691945,,.,-
17899,chrY,19692490,19692491,,.,-
17900,chrY,19703866,19703867,KDM5D,.,-


In [3]:
# Prioritised Zeng et al. target genes
with open("data/zeng_2024/zeng_target_genes.txt", "r") as infile:
    zeng_target_genes = [line.rstrip("\n").replace("'","") for line in infile]

zeng_target_genes_df = pd.DataFrame({"gene_name": zeng_target_genes})
zeng_target_genes = set(zeng_target_genes)
zeng_target_genes

{'ABCC5',
 'AGPAT4',
 'ARHGAP32',
 'ARMC10',
 'ARNT',
 'AZIN1',
 'BRD9',
 'CADM1',
 'CCNL2',
 'CHRNB4',
 'CLSTN3',
 'CNPY3',
 'CORO1C',
 'CSNK2A1',
 'DIDO1',
 'DPYSL5',
 'DYNC1LI1',
 'DYRK2',
 'EGFR',
 'EIF4E2',
 'ELAVL4',
 'ELP1',
 'ELP3',
 'ELP6',
 'EMC10',
 'F11R',
 'FOXK2',
 'G3BP1',
 'GGA2',
 'GOLGA7B',
 'GPR173',
 'GREB1',
 'GSTO2',
 'H3-3B',
 'HIF1AN',
 'HNRNPC',
 'JPT1',
 'KLHL42',
 'KPNA4',
 'LRRC3',
 'LSM14A',
 'MARK3',
 'MDGA1',
 'NAV1',
 'NDRG4',
 'NDUFA9',
 'NEFL',
 'NFE2L1',
 'NSMAF',
 'NUCKS1',
 'NUFIP2',
 'OLA1',
 'PAK4',
 'PARD6G',
 'PGRMC2',
 'PPM1A',
 'PTPN9',
 'RAB11A',
 'RFNG',
 'RPN1',
 'SEC14L1',
 'SERF2',
 'SFPQ',
 'SIX3',
 'SLC19A1',
 'SLC24A3',
 'SLC6A8',
 'SLIT3',
 'SMARCA4',
 'SMC1A',
 'SMG7',
 'SREK1',
 'SSR1',
 'SSU72',
 'STMN2',
 'SYP',
 'TAFA5',
 'TARDBP',
 'TEF',
 'TFDP1',
 'TFDP2',
 'TLX1',
 'TMED10',
 'TMEM106B',
 'TTBK1',
 'UBE2H',
 'UBE2R2',
 'ZMAT2',
 'ZNF462'}

In [4]:
# read in Gencode GTF, extract exons
exons = pr.read_gtf("data/gencode.v40.annotation.gtf").subset(lambda df: df.Feature == "exon")
exons = exons[["Feature", "gene_id", "gene_name", "transcript_id", "exon_number", "transcript_type"]]
# define genic regions
genes = exons.boundaries(group_by="gene_name")
exons

Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,gene_name,transcript_id,exon_number,transcript_type
0,chr1,exon,11868,12227,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,1,processed_transcript
1,chr1,exon,12612,12721,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,2,processed_transcript
2,chr1,exon,13220,14409,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,3,processed_transcript
3,chr1,exon,12009,12057,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,1,transcribed_unprocessed_pseudogene
4,chr1,exon,12178,12227,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,2,transcribed_unprocessed_pseudogene
...,...,...,...,...,...,...,...,...,...,...
1573257,chrY,exon,57214349,57214397,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,1,unprocessed_pseudogene
1573258,chrY,exon,57213879,57213964,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,2,unprocessed_pseudogene
1573259,chrY,exon,57213525,57213602,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,3,unprocessed_pseudogene
1573260,chrY,exon,57213203,57213357,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,4,unprocessed_pseudogene


## Preprocessing step 1 - Consolidate gene name assignment & generate unique PAS ID

Some Name fields weren't filled in BED, need to use gencode annotation to update
Also need to make sure that PAS fall within annotated boundaries, otherwise will never get counting regions for SLAM-seq

In [5]:
# reassign gene name for pas
zeng_bed_gene = zeng_bed.join(genes, strandedness="same", how = "left", slack=10000)
zeng_bed_gene

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,Start_b,End_b,Strand_b,gene_name
0,chr1,629997,629998,MTND2P28,.,+,629061,629433,+,MTND1P23
1,chr1,629997,629998,MTND2P28,.,+,629639,630683,+,MTND2P28
2,chr1,629997,629998,MTND2P28,.,+,631073,632616,+,MTCO1P12
3,chr1,629997,629998,MTND2P28,.,+,632756,633438,+,MTCO2P12
4,chr1,629997,629998,MTND2P28,.,+,633534,633741,+,MTATP8P1
...,...,...,...,...,...,...,...,...,...,...
56639,chrY,19691944,19691945,,.,-,19691940,19694606,-,ENSG00000260197
56640,chrY,19692490,19692491,,.,-,19691940,19694606,-,ENSG00000260197
56641,chrY,19703866,19703867,KDM5D,.,-,19691940,19694606,-,ENSG00000260197
56642,chrY,19703866,19703867,KDM5D,.,-,19703864,19744939,-,KDM5D


In [6]:
# how many PAS do not overlap with any gencode annotated genes (/ within 10kb)
zeng_bed_gene.assign("gene_overlap", lambda df: df.Start_b.ne(-1)).gene_overlap.value_counts()

gene_overlap
True     56641
False        3
Name: count, dtype: int64

In [7]:
# look at nnon-overlaps
zeng_bed_gene.subset(lambda df: df.Start_b == -1)

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,Start_b,End_b,Strand_b,gene_name
0,chr11,134790979,134790980,LINC02714,.,+,-1,-1,+,-1
1,chr13,21332430,21332431,,.,+,-1,-1,+,-1
2,chr13,26420990,26420991,CDK8,.,+,-1,-1,+,-1


In [8]:
# pull out alt PAS for non overlaps, track excluded pas
no_gene_overlap_names = set(zeng_bed_gene.subset(lambda df: df.Start_b == -1).as_df().dropna(subset="Name").Name)
no_gene_overlap_bed = pr.concat([zeng_bed_gene.subset(lambda df: df.Start_b == -1), zeng_bed_gene.subset(lambda df: df.Name.isin(no_gene_overlap_names))]).drop_duplicate_positions()
# exclude from next steps
zeng_bed_gene = zeng_bed_gene.subset(lambda df: (df.Start_b != -1) & ~(df.gene_name.isin(no_gene_overlap_names)))

no_gene_overlap_bed



Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,Start_b,End_b,Strand_b,gene_name
0,chr11,134790979,134790980,LINC02714,.,+,-1,-1,+,-1
1,chr11,134763810,134763811,LINC02714,.,+,134735595,134763810,+,LINC02714
2,chr13,21332430,21332431,,.,+,-1,-1,+,-1
3,chr13,26420990,26420991,CDK8,.,+,-1,-1,+,-1
4,chr13,26405233,26405234,CDK8,.,+,26254103,26405238,+,CDK8


In [9]:
# consolidate gene name - trust original where not NA, otherwise use from gencode
zeng_bed_gene = zeng_bed_gene.assign("gene_name_common", lambda df: pd.Series(np.where(df["Name"].isna(), df["gene_name"], df["Name"])))
zeng_bed_gene

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,Start_b,End_b,Strand_b,gene_name,gene_name_common
0,chr1,629997,629998,MTND2P28,.,+,629061,629433,+,MTND1P23,MTND2P28
1,chr1,629997,629998,MTND2P28,.,+,629639,630683,+,MTND2P28,MTND2P28
2,chr1,629997,629998,MTND2P28,.,+,631073,632616,+,MTCO1P12,MTND2P28
3,chr1,629997,629998,MTND2P28,.,+,632756,633438,+,MTCO2P12,MTND2P28
4,chr1,629997,629998,MTND2P28,.,+,633534,633741,+,MTATP8P1,MTND2P28
...,...,...,...,...,...,...,...,...,...,...,...
56634,chrY,19691944,19691945,,.,-,19691940,19694606,-,ENSG00000260197,ENSG00000260197
56635,chrY,19692490,19692491,,.,-,19691940,19694606,-,ENSG00000260197,ENSG00000260197
56636,chrY,19703866,19703867,KDM5D,.,-,19691940,19694606,-,ENSG00000260197,KDM5D
56637,chrY,19703866,19703867,KDM5D,.,-,19703864,19744939,-,KDM5D,KDM5D


In [10]:
# temp
zeng_bed_gene = zeng_bed_gene.as_df()

In [11]:
# double check that gene names from annotation + names for PAS in BED file match
zeng_bed_gene.loc[:, "gn_match"] = zeng_bed_gene.Name == zeng_bed_gene.gene_name

# may be multiple genic overlaps for each pas, so prefer match where applicable
# Sort by name so matches appear as first row
(zeng_bed_gene.sort_values(by=["Name", "gn_match"], ascending=False)
 # subset to one overlap per PAS, prioritising matches
 .drop_duplicates(subset=["Chromosome", "Start", "End", "Strand"])
 .gn_match.value_counts()
 )


gn_match
True     16804
False     1093
Name: count, dtype: int64

In [12]:
# 1093 PAS where no match with assigned gene name
# how many are NaNs from the BED?
pas_no_match = (zeng_bed_gene.sort_values(by=["Name", "gn_match"], ascending=False)
 # subset to one overlap per PAS, prioritising matches
 .drop_duplicates(subset=["Chromosome", "Start", "End", "Strand"])
 .loc[lambda x: ~x["gn_match"], :]
 )

pas_no_match.Name.isna().value_counts()

Name
True     927
False    166
Name: count, dtype: int64

In [13]:
# 927 just have no assigned gene from BED file (trust gencode)
# what's happening with the 166?
pas_no_match_nna = pas_no_match[~pas_no_match.Name.isna()]
names_no_match_nna = set(pas_no_match_nna.Name)
print(names_no_match_nna)

# are their gene names found in annotation?
print(f"total gene names - {len(names_no_match_nna)}")
print(f"gene name found in annotation - {len(names_no_match_nna.intersection(set(genes.gene_name)))}")
print(names_no_match_nna.intersection(set(genes.gene_name)))

# set(genes.gene_name)

{'ZFTRAF1', 'IFT25', 'ZNF892', 'TCF12-DT', 'FRMD6-AS2', 'CDHR18P', 'BLTP1', 'RPSA2', 'ENTREP3', 'H2BC26', 'LINC01013', 'ENTREP2', 'ZNG1B', 'ATOSA', 'CASTOR3P', 'LINC03051', 'CSTPP1', 'ERCC6L2-AS1', 'KCNA3', 'FAM200C', 'ADAM7-AS1', 'POLR1HASP', 'ABTB3', 'SKIC8', 'ZNG1A', 'C2orf74-DT', 'SLC38A4-AS1', 'ZNG1E', 'GATD1-DT', 'TTC23-AS1', 'LINC03077', 'UQCC6', 'LINC03072', 'LINC02976', 'FLRT2-AS1', 'RAB35-AS1', 'PRPF18', 'UBE2CP5', 'C22orf46P', 'CTDP1-DT', 'HYCC1', 'RNF32-DT', 'ZNG1F', 'LINC02977', 'ZNF496-DT', 'LINC00951', 'TOX-DT', 'MFSD14CP', 'LIPT2-AS1', 'LINC03009', 'EEIG1', 'LINGO2', 'LINC00938', 'BLTP3B', 'TMA7B', 'SLC38A6', 'CFAP90', 'SKIC3', 'MIR3171HG', 'UQCC5', 'HAPSTR1', 'LCDR', 'HYCC2', 'BMAL2', 'LINC02981', 'BLTP3A', 'H2AC25', 'LINC02987', 'IRX2-DT', 'MRFAP1P1', 'PCGF3-AS1', 'MATCAP1', 'MSANTD7', 'LYSET', 'ZNF778-DT', 'ZNF225-AS1', 'PHB1', 'PIERCE2', 'MATCAP2', 'LINC03006', 'PTGR3', 'LASP1NB', 'LINC03026', 'ALG1L1P', 'RIMOC1', 'SCAND3'}
total gene names - 86
gene name found in a

In [14]:
# Are any of Yi's prioritised targets found in this list?
names_no_match_nna.intersection(zeng_target_genes)

set()

No intersection, so will just drop these 166 for now... (but report)

Priority for updating names:
zeng_bed_gene - keep the gn_match and NAs + match, then generate pas_id

In [15]:
# track excluded 166 non na and no match PAS
no_genename_match_bed = pr.PyRanges(pas_no_match[~pas_no_match.Name.isna()].drop(columns=["Start_b", "End_b", "Strand_b", "gene_name", "gene_name_common", "gn_match"]))
no_genename_match_bed

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,247334576,247334577,ZNF496-DT,.,+
1,chr1,247377231,247377232,ZNF496-DT,.,+
2,chr1,228460764,228460765,H2BC26,.,+
3,chr1,228463065,228463066,H2BC26,.,+
4,chr1,110653567,110653568,KCNA3,.,-
...,...,...,...,...,...,...
161,chr22,41698135,41698136,C22orf46P,.,+
162,chrX,153599891,153599892,MRFAP1P1,.,+
163,chrX,153609261,153609262,MRFAP1P1,.,+
164,chrX,98865549,98865550,LINC03077,.,+


In [16]:
# drop the 166, return to pyranges object
zeng_bed_gene = zeng_bed_gene[zeng_bed_gene.gn_match | zeng_bed_gene.Name.isna()]
zeng_bed_gene.loc[:, "pas_id"] = zeng_bed_gene.Chromosome.str.cat(zeng_bed_gene[["Start", "End", "Strand", "gene_name_common"]].astype(str), sep=":")
zeng_bed_upd = pr.PyRanges(zeng_bed_gene.drop(columns=["Start_b", "End_b", "Strand_b", "gene_name", "gn_match"]))
zeng_bed_upd = zeng_bed_upd.drop_duplicate_positions()
zeng_bed_upd

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name_common,pas_id
0,chr1,629997,629998,MTND2P28,.,+,MTND2P28,chr1:629997:629998:+:MTND2P28
1,chr1,630367,630368,MTND2P28,.,+,MTND2P28,chr1:630367:630368:+:MTND2P28
2,chr1,854387,854388,LINC01128,.,+,LINC01128,chr1:854387:854388:+:LINC01128
3,chr1,859444,859445,LINC01128,.,+,LINC01128,chr1:859444:859445:+:LINC01128
4,chr1,1011462,1011463,ISG15,.,+,ISG15,chr1:1011462:1011463:+:ISG15
...,...,...,...,...,...,...,...,...
17726,chrY,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14
17727,chrY,19691944,19691945,,.,-,ENSG00000260197,chrY:19691944:19691945:-:ENSG00000260197
17728,chrY,19692490,19692491,,.,-,ENSG00000260197,chrY:19692490:19692491:-:ENSG00000260197
17729,chrY,19703866,19703867,KDM5D,.,-,KDM5D,chrY:19703866:19703867:-:KDM5D


In [62]:
# track total input PAS
pas_ids = set(zeng_bed_upd.pas_id)
len(pas_ids)

16101

## Preprocessing step 2 - Check for (non-terminal) exon overlapping PAS

Slam-seq counting performed using featurecounts. To do isoform-specific properly, it is essential that regions for each event are non-ambiguous. If PAS overlaps an internal exon, it's impossible to differentiate between txs using region as an internal exon vs a last exon. Such PAS will need to be removed from further analysis

In [17]:
# Get objects containing last exons and non-last exons

# Extract last exons from annotation
exons = exons.assign("exon_number", lambda df: df.exon_number.astype(float).astype(int))
last_exons = get_terminal_regions(exons, number_type="stranded")
# assign an 'exon id' - basically just the coordinates (so can handle identical LEs x transcripts of same gene)
last_exons = last_exons.assign("exon_id", lambda df: df.Chromosome.str.cat(df[["Start", "End", "Strand", "gene_name"]].astype(str), sep=":"))
# don't need it
last_exons = last_exons.drop("exon_number")
# annotate overlapping last exons with a common ID
last_exons = last_exons.cluster(strand=True)

last_exons

Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,gene_name,transcript_id,transcript_type,exon_id,Cluster
0,chr1,exon,13220,14409,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,processed_transcript,chr1:13220:14409:+:DDX11L1,1
1,chr1,exon,13452,13670,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,transcribed_unprocessed_pseudogene,chr1:13452:13670:+:DDX11L1,1
2,chr1,exon,30365,30503,+,ENSG00000284332.1,MIR1302-2,ENST00000607096.1,miRNA,chr1:30365:30503:+:MIR1302-2,2
3,chr1,exon,30975,31097,+,ENSG00000243485.5,MIR1302-2HG,ENST00000473358.1,lncRNA,chr1:30975:31097:+:MIR1302-2HG,3
4,chr1,exon,30975,31109,+,ENSG00000243485.5,MIR1302-2HG,ENST00000469289.1,lncRNA,chr1:30975:31109:+:MIR1302-2HG,3
...,...,...,...,...,...,...,...,...,...,...,...
246619,chrY,exon,57015104,57016096,-,ENSG00000237801.6_PAR_Y,AMD1P2,ENST00000412936.6_PAR_Y,processed_pseudogene,chrY:57015104:57016096:-:AMD1P2,119470
246620,chrY,exon,57165511,57165845,-,ENSG00000228410.6_PAR_Y,ELOCP24,ENST00000456370.6_PAR_Y,processed_pseudogene,chrY:57165511:57165845:-:ELOCP24,119471
246621,chrY,exon,57171889,57172769,-,ENSG00000223484.7_PAR_Y,TRPC6P,ENST00000421233.6_PAR_Y,processed_pseudogene,chrY:57171889:57172769:-:TRPC6P,119472
246622,chrY,exon,57201142,57202020,-,ENSG00000185203.12_PAR_Y,WASIR1,ENST00000399966.9_PAR_Y,lncRNA,chrY:57201142:57202020:-:WASIR1,119473


In [18]:
# anti-overlap to get non-last exons
nonlast_exons = exons.overlap(last_exons, strandedness="same", invert=True)
nonlast_exons

Unnamed: 0,Chromosome,Feature,Start,End,Strand,gene_id,gene_name,transcript_id,exon_number,transcript_type
0,chr1,exon,11868,12227,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,1,processed_transcript
1,chr1,exon,12612,12721,+,ENSG00000223972.5,DDX11L1,ENST00000456328.2,2,processed_transcript
2,chr1,exon,12009,12057,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,1,transcribed_unprocessed_pseudogene
3,chr1,exon,12178,12227,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,2,transcribed_unprocessed_pseudogene
4,chr1,exon,12612,12697,+,ENSG00000223972.5,DDX11L1,ENST00000450305.2,3,transcribed_unprocessed_pseudogene
...,...,...,...,...,...,...,...,...,...,...
935310,chrY,exon,57203181,57203357,-,ENSG00000185203.12_PAR_Y,WASIR1,ENST00000399966.9_PAR_Y,1,lncRNA
935311,chrY,exon,57214349,57214397,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,1,unprocessed_pseudogene
935312,chrY,exon,57213879,57213964,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,2,unprocessed_pseudogene
935313,chrY,exon,57213525,57213602,-,ENSG00000227159.8_PAR_Y,DDX11L16,ENST00000507418.6_PAR_Y,3,unprocessed_pseudogene


In [19]:
# find PAS overlapping with non-terminal exons
pas_nonlast_olap = zeng_bed_upd.join(nonlast_exons, strandedness="same",how="left")
pas_nonlast_olap

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name_common,pas_id,Feature,Start_b,End_b,Strand_b,gene_id,gene_name,transcript_id,exon_number,transcript_type
0,chr1,15727029,15727030,PLEKHM2,.,+,PLEKHM2,chr1:15727029:15727030:+:PLEKHM2,exon,15727013,15727832,+,ENSG00000116786.13,PLEKHM2,ENST00000375799.8,9,protein_coding
1,chr1,15727029,15727030,PLEKHM2,.,+,PLEKHM2,chr1:15727029:15727030:+:PLEKHM2,exon,15727013,15727832,+,ENSG00000116786.13,PLEKHM2,ENST00000375793.2,8,protein_coding
2,chr1,15727029,15727030,PLEKHM2,.,+,PLEKHM2,chr1:15727029:15727030:+:PLEKHM2,exon,15727013,15727068,+,ENSG00000116786.13,PLEKHM2,ENST00000642363.1,9,protein_coding
3,chr1,19904972,19904973,OTUD3,.,+,OTUD3,chr1:19904972:19904973:+:OTUD3,exon,19904890,19904987,+,ENSG00000169914.6,OTUD3,ENST00000375120.4,6,protein_coding
4,chr1,26268506,26268507,CEP85,.,+,CEP85,chr1:26268506:26268507:+:CEP85,exon,26268482,26268635,+,ENSG00000130695.16,CEP85,ENST00000640292.2,7,protein_coding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19864,chrY,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14,-1,-1,-1,-,-1,-1,-1,-1,-1
19865,chrY,19691944,19691945,,.,-,ENSG00000260197,chrY:19691944:19691945:-:ENSG00000260197,-1,-1,-1,-,-1,-1,-1,-1,-1
19866,chrY,19692490,19692491,,.,-,ENSG00000260197,chrY:19692490:19692491:-:ENSG00000260197,-1,-1,-1,-,-1,-1,-1,-1,-1
19867,chrY,19703866,19703867,KDM5D,.,-,KDM5D,chrY:19703866:19703867:-:KDM5D,-1,-1,-1,-,-1,-1,-1,-1,-1


In [20]:
# Count PAS overlapping with non-last exons
(pas_nonlast_olap.as_df()
 .assign(olap=lambda df: df["Start_b"].ne(-1))
 .drop_duplicates(subset=["pas_id"])
 .olap.value_counts()
 )

olap
False    17242
True       489
Name: count, dtype: int64

In [21]:
# isolate exon-overlapping, track gene names to remvoe their counterpart PAS
nonlast_olap_bed = (pas_nonlast_olap.subset(lambda df: df.Start_b.ne(-1))
 .drop_duplicate_positions()
 .drop(like="_b$")
 .drop(["Feature", "gene_id", "gene_name", "transcript_id", "exon_number", "transcript_type"])
 )

# gene names
nonlast_olap_names = set(nonlast_olap_bed.gene_name_common)

nonlast_olap_bed

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name_common,pas_id
0,chr1,15727029,15727030,PLEKHM2,.,+,PLEKHM2,chr1:15727029:15727030:+:PLEKHM2
1,chr1,19904972,19904973,OTUD3,.,+,OTUD3,chr1:19904972:19904973:+:OTUD3
2,chr1,26268506,26268507,CEP85,.,+,CEP85,chr1:26268506:26268507:+:CEP85
3,chr1,32031568,32031569,KHDRBS1,.,+,KHDRBS1,chr1:32031568:32031569:+:KHDRBS1
4,chr1,40059359,40059360,CAP1,.,+,CAP1,chr1:40059359:40059360:+:CAP1
...,...,...,...,...,...,...,...,...
484,chrX,119841736,119841737,UPF3B,.,-,UPF3B,chrX:119841736:119841737:-:UPF3B
485,chrX,131768620,131768621,FIRRE,.,-,FIRRE,chrX:131768620:131768621:-:FIRRE
486,chrX,136225931,136225932,MAP7D3,.,-,MAP7D3,chrX:136225931:136225932:-:MAP7D3
487,chrX,139588359,139588360,MCF2,.,-,MCF2,chrX:139588359:139588360:-:MCF2


In [22]:
# remove the exon overlapping PAS (+ their counterparts)
zeng_bed_upd = (pas_nonlast_olap.subset(lambda df: df.Start_b.eq(-1))
                .subset(lambda df: ~df.gene_name_common.isin(nonlast_olap_names))
                .drop(like="_b$")
                .drop(["Feature", "gene_id", "gene_name", "transcript_id", "exon_number", "transcript_type"])
                )

zeng_bed_upd

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name_common,pas_id
0,chr1,629997,629998,MTND2P28,.,+,MTND2P28,chr1:629997:629998:+:MTND2P28
1,chr1,630367,630368,MTND2P28,.,+,MTND2P28,chr1:630367:630368:+:MTND2P28
2,chr1,854387,854388,LINC01128,.,+,LINC01128,chr1:854387:854388:+:LINC01128
3,chr1,859444,859445,LINC01128,.,+,LINC01128,chr1:859444:859445:+:LINC01128
4,chr1,1011462,1011463,ISG15,.,+,ISG15,chr1:1011462:1011463:+:ISG15
...,...,...,...,...,...,...,...,...
16096,chrY,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14
16097,chrY,19691944,19691945,,.,-,ENSG00000260197,chrY:19691944:19691945:-:ENSG00000260197
16098,chrY,19692490,19692491,,.,-,ENSG00000260197,chrY:19692490:19692491:-:ENSG00000260197
16099,chrY,19703866,19703867,KDM5D,.,-,KDM5D,chrY:19703866:19703867:-:KDM5D


In [24]:
# check which of provided targets are lost to exon overlap
zeng_targets_nonlast = zeng_target_genes.intersection(nonlast_olap_names)
zeng_targets_nonlast

{'BRD9', 'CLSTN3', 'HNRNPC', 'LSM14A', 'SEC14L1', 'SLC24A3'}

## Define tandem 3'UTR regions


In [28]:
# Assign PAS to last exons based on overlap
# allow small slack of 100nt, so PAS close to annotated bou3'ends are considered overlapping
pas_le = last_exons[["exon_id", "Cluster"]].join(zeng_bed_upd, strandedness="same", slack=100)
pas_le

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Name,Score,Strand_b,gene_name_common,pas_id
0,chr1,629639,630683,+,chr1:629639:630683:+:MTND2P28,16,629997,629998,MTND2P28,.,+,MTND2P28,chr1:629997:629998:+:MTND2P28
1,chr1,629639,630683,+,chr1:629639:630683:+:MTND2P28,16,630367,630368,MTND2P28,.,+,MTND2P28,chr1:630367:630368:+:MTND2P28
2,chr1,853390,854398,+,chr1:853390:854398:+:LINC01128,35,854387,854388,LINC01128,.,+,LINC01128,chr1:854387:854388:+:LINC01128
3,chr1,853390,854387,+,chr1:853390:854387:+:LINC01128,35,854387,854388,LINC01128,.,+,LINC01128,chr1:854387:854388:+:LINC01128
4,chr1,853390,854385,+,chr1:853390:854385:+:LINC01128,35,854387,854388,LINC01128,.,+,LINC01128,chr1:854387:854388:+:LINC01128
...,...,...,...,...,...,...,...,...,...,...,...,...,...
46186,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703866,19703867,KDM5D,.,-,KDM5D,chrY:19703866:19703867:-:KDM5D
46187,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19705419,19705420,KDM5D,.,-,KDM5D,chrY:19705419:19705420:-:KDM5D
46188,chrY,19705416,19706345,-,chrY:19705416:19706345:-:KDM5D,119349,19705419,19705420,KDM5D,.,-,KDM5D,chrY:19705419:19705420:-:KDM5D
46189,chrY,19705419,19706345,-,chrY:19705419:19706345:-:KDM5D,119349,19705419,19705420,KDM5D,.,-,KDM5D,chrY:19705419:19705420:-:KDM5D


In [38]:
# Count the number of unique PAS overlapping last exons.
# do this at gene level, so interpretation = at least one annot last exon has x overlapping PAS
# (could have problems with shorter annotated last exons of same gene only overlapping with proximal PAS)
# (2 = 3'UTR PAS, 1 = ALEs)
num_olap_pas = pas_le.as_df().groupby("exon_id")["pas_id"].nunique().reset_index()
num_olap_pas.loc[:, "gene_name"] = num_olap_pas.exon_id.str.split(":", expand=True, regex=False)[4]

# number of putative 3'UTR APA / ALE PAS
num_olap_pas_genemax = num_olap_pas.groupby("gene_name")["pas_id"].max()
num_olap_pas_genemax.value_counts()

pas_id
2    5000
1    2909
3       5
Name: count, dtype: int64

In [33]:
# Get last exons with two overlapping PAS & subset for joined last exon + pas coordinates
tandem_exons_ids = set(num_olap_pas.loc[num_olap_pas["pas_id"] == 2, "exon_id"])


pas_le_3utr = pas_le.subset(lambda df: df.exon_id.isin(tandem_exons_ids))
pas_le_3utr

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Name,Score,Strand_b,gene_name_common,pas_id
0,chr1,629639,630683,+,chr1:629639:630683:+:MTND2P28,16,629997,629998,MTND2P28,.,+,MTND2P28,chr1:629997:629998:+:MTND2P28
1,chr1,629639,630683,+,chr1:629639:630683:+:MTND2P28,16,630367,630368,MTND2P28,.,+,MTND2P28,chr1:630367:630368:+:MTND2P28
2,chr1,853390,859446,+,chr1:853390:859446:+:LINC01128,35,854387,854388,LINC01128,.,+,LINC01128,chr1:854387:854388:+:LINC01128
3,chr1,853390,859446,+,chr1:853390:859446:+:LINC01128,35,859444,859445,LINC01128,.,+,LINC01128,chr1:859444:859445:+:LINC01128
4,chr1,1065829,1066449,+,chr1:1065829:1066449:+:ENSG00000217801,57,1066354,1066355,,.,+,ENSG00000217801,chr1:1066354:1066355:+:ENSG00000217801
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24793,chrY,19691940,19694606,-,chrY:19691940:19694606:-:ENSG00000260197,119348,19692490,19692491,,.,-,ENSG00000260197,chrY:19692490:19692491:-:ENSG00000260197
24794,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703866,19703867,KDM5D,.,-,KDM5D,chrY:19703866:19703867:-:KDM5D
24795,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19705419,19705420,KDM5D,.,-,KDM5D,chrY:19705419:19705420:-:KDM5D
24796,chrY,19703864,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703866,19703867,KDM5D,.,-,KDM5D,chrY:19703866:19703867:-:KDM5D


In [34]:
# update 3' exon coordinates to the PAS coordinate
pas_le_3utr_upd = pas_le_3utr.apply(lambda df: _df_update_3p(df))
pas_le_3utr_upd

Number of negative or zero-length updated intervals to be dropped - 1312
Number of negative or zero-length updated intervals to be dropped - 1304
Number of negative or zero-length updated intervals to be dropped - 1040
Number of negative or zero-length updated intervals to be dropped - 822
Number of negative or zero-length updated intervals to be dropped - 776
Number of negative or zero-length updated intervals to be dropped - 790
Number of negative or zero-length updated intervals to be dropped - 568
Number of negative or zero-length updated intervals to be dropped - 542
Number of negative or zero-length updated intervals to be dropped - 680
Number of negative or zero-length updated intervals to be dropped - 668
Number of negative or zero-length updated intervals to be dropped - 472
Number of negative or zero-length updated intervals to be dropped - 720
Number of negative or zero-length updated intervals to be dropped - 596
Number of negative or zero-length updated intervals to be dro

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Name,Score,Strand_b,gene_name_common,pas_id
0,chr1,629639,629998,+,chr1:629639:630683:+:MTND2P28,16,629997,630683,MTND2P28,.,+,MTND2P28,chr1:629997:629998:+:MTND2P28
1,chr1,629639,630368,+,chr1:629639:630683:+:MTND2P28,16,630367,630683,MTND2P28,.,+,MTND2P28,chr1:630367:630368:+:MTND2P28
2,chr1,853390,854388,+,chr1:853390:859446:+:LINC01128,35,854387,859446,LINC01128,.,+,LINC01128,chr1:854387:854388:+:LINC01128
3,chr1,853390,859445,+,chr1:853390:859446:+:LINC01128,35,859444,859446,LINC01128,.,+,LINC01128,chr1:859444:859445:+:LINC01128
4,chr1,1065829,1066355,+,chr1:1065829:1066449:+:ENSG00000217801,57,1066354,1066449,,.,+,ENSG00000217801,chr1:1066354:1066355:+:ENSG00000217801
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24671,chrY,19692490,19694606,-,chrY:19691940:19694606:-:ENSG00000260197,119348,19691940,19692491,,.,-,ENSG00000260197,chrY:19692490:19692491:-:ENSG00000260197
24672,chrY,19703866,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703864,19703867,KDM5D,.,-,KDM5D,chrY:19703866:19703867:-:KDM5D
24673,chrY,19705419,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703864,19705420,KDM5D,.,-,KDM5D,chrY:19705419:19705420:-:KDM5D
24674,chrY,19703866,19706345,-,chrY:19703864:19706345:-:KDM5D,119349,19703864,19703867,KDM5D,.,-,KDM5D,chrY:19703866:19703867:-:KDM5D


In [36]:
# target genes fitting tandem criteria
tandem_genes = set(pas_le_3utr_upd.gene_name_common)
zeng_targets_tandem = zeng_target_genes.intersection(tandem_genes)
len(zeng_targets_tandem)

55

## Define ALE regions

In theory these are genes with only 1 pas overlapping each of two distinct ALEs.


In [41]:
# get putative ALEs/ALE-overlapping IDs
# i.e. genes with max exon-PAS overlap count of 1, becasue these do not have any tandem APA
put_ale_genes = set(num_olap_pas_genemax[num_olap_pas_genemax.eq(1)].index)
put_ale_ids = set(num_olap_pas.loc[num_olap_pas.gene_name.isin(put_ale_genes), "exon_id"])

pas_le_ale = pas_le.subset(lambda df: df.exon_id.isin(put_ale_ids))
pas_le_ale

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Name,Score,Strand_b,gene_name_common,pas_id
0,chr1,1013983,1014540,+,chr1:1013983:1014540:+:ISG15,51,1014537,1014538,ISG15,.,+,ISG15,chr1:1014537:1014538:+:ISG15
1,chr1,1013983,1014540,+,chr1:1013983:1014540:+:ISG15,51,1014537,1014538,ISG15,.,+,ISG15,chr1:1014537:1014538:+:ISG15
2,chr1,1616507,1617898,+,chr1:1616507:1617898:+:MIB2,100,1617320,1617321,MIB2,.,+,MIB2,chr1:1617320:1617321:+:MIB2
3,chr1,1616507,1619210,+,chr1:1616507:1619210:+:MIB2,100,1617320,1617321,MIB2,.,+,MIB2,chr1:1617320:1617321:+:MIB2
4,chr1,1617077,1617323,+,chr1:1617077:1617323:+:MIB2,100,1617320,1617321,MIB2,.,+,MIB2,chr1:1617320:1617321:+:MIB2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13590,chrY,18930470,18932841,-,chrY:18930470:18932841:-:TTTY14,119340,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14
13591,chrY,18932006,18932841,-,chrY:18932006:18932841:-:TTTY14,119340,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14
13592,chrY,18932316,18932841,-,chrY:18932316:18932841:-:TTTY14,119340,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14
13593,chrY,18932435,18932841,-,chrY:18932435:18932841:-:TTTY14,119340,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14


In [44]:
# true ALEs should be non-overlapping.
# Each gene should therefore have two unique PAS, and two unique Cluster values

# count unqiue pas + non-overlapping ales
ale_uniq_counts = pas_le_ale.as_df().groupby("gene_name_common")[["Cluster", "pas_id"]].nunique().reset_index()

# quick check counts
ale_uniq_counts[["Cluster","pas_id"]].value_counts()

Cluster  pas_id
1        1         1704
2        2         1116
1        2           14
3        2           12
2        1           10
3        3            1
4        2            1
         4            1
Name: count, dtype: int64

In [47]:
# 2 + 2 represent 'bonafide' ALEs - extract + report
# get last exons with two overlapping PAS & subset for joined last exon + pas coordinates
# get IDs for 'bonafide' ALEs
ale_bf_gene_names = ale_uniq_counts.loc[(ale_uniq_counts["Cluster"] == 2) & (ale_uniq_counts["pas_id"] == 2), "gene_name_common"]
pas_le_ale_bf = pas_le_ale.subset(lambda df: df.gene_name_common.isin(ale_bf_gene_names))
pas_le_ale_bf

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Name,Score,Strand_b,gene_name_common,pas_id
0,chr1,1616507,1617898,+,chr1:1616507:1617898:+:MIB2,100,1617320,1617321,MIB2,.,+,MIB2,chr1:1617320:1617321:+:MIB2
1,chr1,1616507,1619210,+,chr1:1616507:1619210:+:MIB2,100,1617320,1617321,MIB2,.,+,MIB2,chr1:1617320:1617321:+:MIB2
2,chr1,1617077,1617323,+,chr1:1617077:1617323:+:MIB2,100,1617320,1617321,MIB2,.,+,MIB2,chr1:1617320:1617321:+:MIB2
3,chr1,1630291,1630604,+,chr1:1630291:1630604:+:MIB2,109,1630604,1630605,MIB2,.,+,MIB2,chr1:1630604:1630605:+:MIB2
4,chr1,1630291,1630604,+,chr1:1630291:1630604:+:MIB2,109,1630604,1630605,MIB2,.,+,MIB2,chr1:1630604:1630605:+:MIB2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7341,chrY,18930470,18932841,-,chrY:18930470:18932841:-:TTTY14,119340,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14
7342,chrY,18932006,18932841,-,chrY:18932006:18932841:-:TTTY14,119340,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14
7343,chrY,18932316,18932841,-,chrY:18932316:18932841:-:TTTY14,119340,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14
7344,chrY,18932435,18932841,-,chrY:18932435:18932841:-:TTTY14,119340,18932449,18932450,TTTY14,.,-,TTTY14,chrY:18932449:18932450:-:TTTY14


In [48]:
# update 3' exon coordinates to the PAS coordinate
pas_le_ale_bf_upd = pas_le_ale_bf.apply(lambda df: _df_update_3p(df))
pas_le_ale_bf_upd = pas_le_ale_bf_upd.drop_duplicate_positions(strand=True)
pas_le_ale_bf_upd

Number of negative or zero-length updated intervals to be dropped - 411
Number of negative or zero-length updated intervals to be dropped - 282
Number of negative or zero-length updated intervals to be dropped - 396
Number of negative or zero-length updated intervals to be dropped - 264
Number of negative or zero-length updated intervals to be dropped - 212
Number of negative or zero-length updated intervals to be dropped - 125
Number of negative or zero-length updated intervals to be dropped - 187
Number of negative or zero-length updated intervals to be dropped - 170
Number of negative or zero-length updated intervals to be dropped - 128
Number of negative or zero-length updated intervals to be dropped - 61
Number of negative or zero-length updated intervals to be dropped - 171
Number of negative or zero-length updated intervals to be dropped - 220
Number of negative or zero-length updated intervals to be dropped - 210
Number of negative or zero-length updated intervals to be dropped

Unnamed: 0,Chromosome,Start,End,Strand,exon_id,Cluster,Start_b,End_b,Name,Score,Strand_b,gene_name_common,pas_id
0,chr1,1616507,1617321,+,chr1:1616507:1617898:+:MIB2,100,1617320,1617898,MIB2,.,+,MIB2,chr1:1617320:1617321:+:MIB2
1,chr1,1617077,1617321,+,chr1:1617077:1617323:+:MIB2,100,1617320,1617323,MIB2,.,+,MIB2,chr1:1617320:1617321:+:MIB2
2,chr1,1630291,1630605,+,chr1:1630291:1630604:+:MIB2,109,1630604,1630604,MIB2,.,+,MIB2,chr1:1630604:1630605:+:MIB2
3,chr1,6632795,6633562,+,chr1:6632795:6633562:+:THAP3,248,6633561,6633562,THAP3,.,+,THAP3,chr1:6633561:6633562:+:THAP3
4,chr1,6634019,6635586,+,chr1:6634019:6635586:+:THAP3,249,6635585,6635586,THAP3,.,+,THAP3,chr1:6635585:6635586:+:THAP3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2643,chrY,19593087,19594158,+,chrY:19593087:19594161:+:TXLNGY,118984,19594157,19594161,,.,+,TXLNGY,chrY:19594157:19594158:+:TXLNGY
2644,chrY,13234577,13234826,-,chrY:13234576:13234826:-:UTY,119277,13234576,13234578,UTY,.,-,UTY,chrY:13234577:13234578:-:UTY
2645,chrY,13248385,13249882,-,chrY:13248378:13249882:-:UTY,119278,13248378,13248386,UTY,.,-,UTY,chrY:13248385:13248386:-:UTY
2646,chrY,18872593,18872834,-,chrY:18872500:18872834:-:TTTY14,119337,18872500,18872594,TTTY14,.,-,TTTY14,chrY:18872593:18872594:-:TTTY14


In [54]:
# track target genes classified as bonafide ALEs
# target genes fitting tandem criteria
zeng_targets_ale_bf = zeng_target_genes.intersection(ale_bf_gene_names)
print(len(zeng_targets_ale_bf))
zeng_targets_ale_bf

11


{'ABCC5',
 'CCNL2',
 'DIDO1',
 'EIF4E2',
 'ELAVL4',
 'GREB1',
 'SFPQ',
 'SLC19A1',
 'SSU72',
 'SYP',
 'TFDP1'}

In [57]:
# what are the unnacounted for target genes?
zeng_int_unaccounted = zeng_target_genes.difference(zeng_targets_tandem.union(zeng_targets_nonlast).union(zeng_targets_ale_bf))
print(len(zeng_int_unaccounted))
zeng_int_unaccounted

17


{'ARHGAP32',
 'CHRNB4',
 'CNPY3',
 'EGFR',
 'ELP6',
 'GSTO2',
 'NSMAF',
 'PAK4',
 'PARD6G',
 'RFNG',
 'SIX3',
 'SLIT3',
 'SMG7',
 'STMN2',
 'TAFA5',
 'TLX1',
 'TTBK1'}

ARHGAP32, CNPY3, RFNG, SIX3, STMN2, TLX1 are known events I've captured with my pipeline, so I'll add them in via those models. For the remaining, need to dig deeper

In [60]:
papa_hits = "ARHGAP32, CNPY3, RFNG, SIX3, STMN2, TLX1".split(", ")
zeng_int_unaccounted = zeng_int_unaccounted.difference(papa_hits)
zeng_int_unaccounted

{'CHRNB4',
 'EGFR',
 'ELP6',
 'GSTO2',
 'NSMAF',
 'PAK4',
 'PARD6G',
 'SLIT3',
 'SMG7',
 'TAFA5',
 'TTBK1'}

## Rescuing missing targets - novel extensions of annotated last exons? Or just novel events

Find nearest upstream ALEs for the missing targets. With nearest approach, overlapping PAS will be assigned 0, so can assess whether both PAS are accounted for.

Realistically assessing two points:
1. Are PAS just downstream of annotated last exons? In which case, can rescue these events by attaching to and extending annotated event
2. Do they represent unannotated last exons? In which case, quite tricky to use with bulk RNA-seq. If deep intronic, could just extend a specified distance upstream of the PAS to define a counting window

In [65]:
# subset to unnacounted PAS
missing_pas_ids = pas_ids.difference(set(pas_le_3utr_upd.pas_id)).difference(set(pas_le_ale_bf_upd.pas_id))
len(missing_pas_ids)
zeng_bed_upd_missing = zeng_bed_upd.subset(lambda df: df.pas_id.isin(missing_pas_ids))
zeng_bed_upd_missing

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name_common,pas_id
0,chr1,1011462,1011463,ISG15,.,+,ISG15,chr1:1011462:1011463:+:ISG15
1,chr1,1014537,1014538,ISG15,.,+,ISG15,chr1:1014537:1014538:+:ISG15
2,chr1,1728507,1728508,,.,+,ENSG00000227775,chr1:1728507:1728508:+:ENSG00000227775
3,chr1,1735571,1735572,,.,+,ENSG00000227775,chr1:1735571:1735572:+:ENSG00000227775
4,chr1,10275251,10275252,KIF1B,.,+,KIF1B,chr1:10275251:10275252:+:KIF1B
...,...,...,...,...,...,...,...,...
3940,chrY,13711999,13712000,,.,+,TMSB4Y,chrY:13711999:13712000:+:TMSB4Y
3941,chrY,13889010,13889011,ANOS2P,.,+,ANOS2P,chrY:13889010:13889011:+:ANOS2P
3942,chrY,13899472,13899473,ANOS2P,.,+,ANOS2P,chrY:13899472:13899473:+:ANOS2P
3943,chrY,12326385,12326386,,.,-,GYG2P1,chrY:12326385:12326386:-:GYG2P1


In [66]:
# calculate distance to nearest (upstream) last exon (allowing overlaps)
pas_le_missing_nr = zeng_bed_upd_missing.nearest(last_exons, strandedness="same", overlap=True,how="upstream",)
pas_le_missing_nr


Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,gene_name_common,pas_id,Feature,Start_b,End_b,Strand_b,gene_id,gene_name,transcript_id,transcript_type,exon_id,Cluster,Distance
0,chr1,1014537,1014538,ISG15,.,+,ISG15,chr1:1014537:1014538:+:ISG15,exon,1013983,1014540,+,ENSG00000187608.10,ISG15,ENST00000624697.4,protein_coding,chr1:1013983:1014540:+:ISG15,51,0
1,chr1,10381599,10381600,KIF1B,.,+,KIF1B,chr1:10381599:10381600:+:KIF1B,exon,10376544,10381603,+,ENSG00000054523.20,KIF1B,ENST00000377086.5,protein_coding,chr1:10376544:10381603:+:KIF1B,334,0
2,chr1,10630756,10630757,PEX14,.,+,PEX14,chr1:10630756:10630757:+:PEX14,exon,10629530,10630758,+,ENSG00000142655.13,PEX14,ENST00000356607.9,protein_coding,chr1:10629530:10630758:+:PEX14,352,0
3,chr1,15940455,15940456,SPEN,.,+,SPEN,chr1:15940455:15940456:+:SPEN,exon,15939295,15940456,+,ENSG00000065526.12,SPEN,ENST00000375759.8,protein_coding,chr1:15939295:15940456:+:SPEN,517,0
4,chr1,21484899,21484900,NBPF3,.,+,NBPF3,chr1:21484899:21484900:+:NBPF3,exon,21483142,21485005,+,ENSG00000142794.19,NBPF3,ENST00000318220.10,nonsense_mediated_decay,chr1:21483142:21485005:+:NBPF3,653,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3940,chrY,13711999,13712000,,.,+,TMSB4Y,chrY:13711999:13712000:+:TMSB4Y,exon,13705224,13706024,+,ENSG00000154620.6,TMSB4Y,ENST00000284856.4,protein_coding,chrY:13705224:13706024:+:TMSB4Y,118922,5976
3941,chrY,13889010,13889011,ANOS2P,.,+,ANOS2P,chrY:13889010:13889011:+:ANOS2P,exon,13887257,13888980,+,ENSG00000241859.7,ANOS2P,ENST00000652544.1,processed_transcript,chrY:13887257:13888980:+:ANOS2P,118925,31
3942,chrY,13899472,13899473,ANOS2P,.,+,ANOS2P,chrY:13899472:13899473:+:ANOS2P,exon,13887257,13888980,+,ENSG00000241859.7,ANOS2P,ENST00000652544.1,processed_transcript,chrY:13887257:13888980:+:ANOS2P,118925,10493
3943,chrY,12378537,12378538,,.,-,GYG2P1,chrY:12378537:12378538:-:GYG2P1,exon,12378531,12378749,-,ENSG00000206159.12,GYG2P1,ENST00000382965.3,transcribed_unprocessed_pseudogene,chrY:12378531:12378749:-:GYG2P1,119266,0


In [73]:
# for each gene, report the min and max distance (i.e. how many genes are just missing annotated last exon for 1 PAS?)
dist_min_max = pas_le_missing_nr.as_df().groupby("gene_name_common")["Distance"].agg([min, max]).reset_index().sort_values(by=["min", "max"])

# gene counts for 
# both overlap last exons (True, True),
# 1 overlaps last exon (True, False) 
# and neither overlaps (False, False)
dist_min_max.drop(columns="gene_name_common").eq(0,axis="columns").value_counts()

min    max  
True   False    1592
False  False     356
True   True       46
Name: count, dtype: int64

In [75]:
dist_min_max[dist_min_max.gene_name_common.isin(zeng_int_unaccounted)]

Unnamed: 0,gene_name_common,min,max
1528,SLIT3,0,743
487,ELP6,0,989
1534,SMG7,0,1902
1753,TTBK1,0,4036
683,GSTO2,0,4802
1175,PARD6G,0,7662
307,CHRNB4,0,28792
1168,PAK4,0,38634
475,EGFR,0,83003
1632,TAFA5,0,144633


In [81]:
# assign to overlap group based on both overlap last exons (2), 1 overlaps last exon (1) or neither (0) 
dist_min_max.loc[:, "overlap_group"] = dist_min_max.drop(columns="gene_name_common").eq(0,axis="columns").sum(axis="columns")
dist_min_max

Unnamed: 0,gene_name_common,min,max,overlap_group
80,APEX1,0,0,2
135,ATXN2,0,0,2
143,B4GALNT1,0,0,2
358,CPT1B,0,0,2
377,CTSL,0,0,2
...,...,...,...,...
1853,XKR4,202640,233429,0
1658,TEX10,239536,241106,0
1872,ZCCHC17,259686,274242,0
1121,NR4A2,436250,441767,0


In [84]:
# try different max distance thresholds (how many genes rescued?)
print(f"Number of genes - {dist_min_max.loc[dist_min_max.overlap_group.ne(0), 'gene_name_common'].nunique()}")
{str(cutoff): len(dist_min_max[(dist_min_max["overlap_group"].ne(0)) & (dist_min_max["max"] <= cutoff)]) for cutoff in [0,25,50,100,200,500,1000, 2500, 5000]}


Number of genes - 1638


{'0': 46,
 '25': 55,
 '50': 55,
 '100': 58,
 '200': 92,
 '500': 162,
 '1000': 251,
 '2500': 487,
 '5000': 705}