In [8]:
import pyranges as pr
import pandas as pd
from Bio.Seq import Seq
from peptide_helpers import *

In [9]:
# Define the set of manually validated bleedthrough events (IDs & last exons) for which to extract peptide seqs
bleeds = pd.read_csv("data/bleedthrough_manual_validation.tsv", sep="\t")

# keep only events that passed manual validation
bleeds_y = bleeds[bleeds["event_manual_validation"] == "yes"]
# filter for at least 1 i3 cortical dataset (experiment_name is a comma-separated string of dataset names)
# na=False: a missing experiment_name is treated as 'no i3 cortical dataset' rather than putting NaN
# into the mask, which would make the boolean indexing below raise
in_i3_cortical = bleeds_y["experiment_name"].str.contains("i3_cortical", regex=False, na=False)
bleeds_y = bleeds_y[in_i3_cortical]
bleeds_gn = bleeds_y.gene_name.unique().tolist()
# NB: the name bleeds_le is rebound to a PyRanges object when the bleedthrough BED file is read further down
bleeds_le = bleeds_y.le_id.unique().tolist()

print(f"Number of cryptics - {bleeds_y.le_id.nunique()}")
bleeds_y


Number of cryptics - 18


Unnamed: 0,le_id,gene_name,simple_event_type,n_exper_cryptic,experiment_name,annot_status,dataset,event_manual_validation,cryptic_riboseq_reads,riboseq_downstream_change,notes
0,ENSG00000137161.18_1,CNPY3,bleedthrough,13,"humphrey_i3_cortical,seddighi_i3_cortical,zano...",novel,,yes,yes,unclear,reads in cryptic also downstream of stops in a...
2,ENSG00000103404.15_1,USP31,bleedthrough,8,"humphrey_i3_cortical,zanovello_i3_cortical_upf...",novel,,yes,no,unclear,
6,ENSG00000105186.16_3,ANKRD27,bleedthrough,5,"brown_i3_cortical,humphrey_i3_cortical,seddigh...",novel,,yes,yes,yes,
9,ENSG00000142794.19_3,NBPF3,bleedthrough,4,"brown_skndz,humphrey_i3_cortical,zanovello_shs...",novel,,yes,unclear,unclear,expressed in ctls but likely cryptic by ratio ...
10,ENSG00000078124.13_1,ACER3,bleedthrough,3,"brown_i3_cortical,seddighi_i3_cortical,zanovel...",novel,,yes,few reads,unclear,"coverage quite low, but reads consistently up ..."
11,ENSG00000119392.16_1,GLE1,bleedthrough,3,zanovello_i3_cortical_upf1_tdp_tdpkd_upf1ctl_v...,novel,zanovello_shsy5y_curve_0075,yes,,,3'end of bleedthroughs are truncated. lowly ex...
12,ENSG00000127511.10_2,SIN3B,bleedthrough,3,"brown_i3_cortical,humphrey_i3_cortical,seddigh...",novel,,yes,no,yes,cryptic CDS terminates 1 codon downstream of SJ
17,ENSG00000155897.10_1,ADCY8,bleedthrough,2,"humphrey_i3_cortical,zanovello_i3_cortical_upf...",novel,,yes,no,no,seddighi - event looks real but v lowly expres...
18,ENSG00000162944.11_1,RFTN2,bleedthrough,2,"humphrey_i3_cortical,zanovello_i3_cortical_upf...",novel,,yes,no,no,"very clean, obvious downreg of RNA downstream...."
24,ENSG00000101624.11_3,CEP76,bleedthrough,1,brown_i3_cortical,annotated,,yes,no,no,"looks like real 3'UTR, but unclear whether cry..."


In [10]:
# sanity check: list the datasets in which the NBPF3 event was called
is_nbpf3 = bleeds_y["gene_name"] == "NBPF3"
bleeds_y.loc[is_nbpf3, "experiment_name"].tolist()

['brown_skndz,humphrey_i3_cortical,zanovello_shsy5y_curve_0021,zanovello_shsy5y_curve_0075']

In [31]:
# Read in the consolidated bleedthrough coordinates.
# NB: THESE ARE THE UNIQUE REGION COORDINATES, so they are not used to extract peptides
# (they are only used to report coordinates).
def _add_le_id_and_gene_name(df):
    """Append le_id and gene_name columns parsed from the first two '|'-separated fields of Name."""
    name_fields = df.Name.str.split("\\|", expand=True)[[0, 1]]
    name_fields = name_fields.rename(columns={0: "le_id", 1: "gene_name"})
    return pd.concat([df, name_fields], axis=1)


bleeds_le = pr.read_bed("data/2023-07-04_papa_cryptic_bleedthrough.last_exons.bed")
# keep intervals whose Name field contains 'cryptic', then pull out le_id and gene_name from Name
is_cryptic = bleeds_le.Name.str.contains("cryptic", regex=False)
bleeds_le_cryp = bleeds_le[is_cryptic].apply(_add_le_id_and_gene_name)
bleeds_le_cryp

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,le_id,gene_name
0,chr1,147622531,147623356,ENSG00000116128.12_2|BCL9|bleedthrough|cryptic,.,+,ENSG00000116128.12_2,BCL9
1,chr1,45013791,45013889,ENSG00000126088.14_1|UROD|bleedthrough|cryptic,.,+,ENSG00000126088.14_1,UROD
2,chr1,21453372,21457150,ENSG00000142794.19_3|NBPF3|bleedthrough|cryptic,.,+,ENSG00000142794.19_3,NBPF3
3,chr1,156139102,156140091,ENSG00000160789.24_11|LMNA|bleedthrough|cryptic,.,+,ENSG00000160789.24_11,LMNA
4,chr1,1616614,1619210,ENSG00000197530.13_1|MIB2|bleedthrough|cryptic,.,+,ENSG00000197530.13_1,MIB2
...,...,...,...,...,...,...,...,...
69,chr21,42849225,42850040,ENSG00000160193.12_1|WDR4|bleedthrough|cryptic,.,-,ENSG00000160193.12_1,WDR4
70,chrX,91882906,91891321,ENSG00000102290.23_3|PCDH11X|bleedthrough|cryptic,.,+,ENSG00000102290.23_3,PCDH11X
71,chrX,107087378,107088435,ENSG00000089682.17_2|RBM41|bleedthrough|cryptic,.,-,ENSG00000089682.17_2,RBM41
72,chrX,40653641,40654363,ENSG00000180182.11_3|MED14|bleedthrough|cryptic,.,-,ENSG00000180182.11_3,MED14


In [12]:


# bleeds_le_cryp

In [13]:
# Alternative approach: read the last exon GTF so that the full last exon sequence is available
full_bleeds_le = pr.read_gtf("data/novel_ref_combined.last_exons.gtf")

# keep only the last exons belonging to the manually validated events
validated_le_ids = set(bleeds_y["le_id"])
full_bleeds_le_mv = full_bleeds_le.subset(lambda df: df["le_id"].isin(validated_le_ids))

full_bleeds_le_mv

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,21453372,21457150,.,+,.,PAPA.TDP-1.345,,...,,,,,,last_exon_extension,ENSG00000142794.19,NBPF3,3.0,ENSG00000142794.19_3
1,chr1,.,exon,84561927,84563418,.,-,.,PAPA.ctrl_ctrl_1.1091,,...,,,,,,internal_exon_extension,ENSG00000117151.13,CTBS,1.0,ENSG00000117151.13_1
2,chr2,.,exon,225651431,225658565,.,+,.,PAPA.TDP43_ctrl_3.4451,,...,,,,,,internal_exon_extension,ENSG00000144460.13,NYAP2,2.0,ENSG00000144460.13_2
3,chr2,.,exon,197630416,197631220,.,-,.,PAPA.TDP43_ctrl_4.4119,,...,,,,,,internal_exon_extension,ENSG00000162944.11,RFTN2,1.0,ENSG00000162944.11_1
4,chr2,.,exon,197630635,197631220,.,-,.,PAPA.TDP-4.4038,,...,,,,,,internal_exon_extension,ENSG00000162944.11,RFTN2,1.0,ENSG00000162944.11_1
5,chr5,.,exon,148377588,148378845,.,-,.,PAPA.TDP43_19065413_S19.7930,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
6,chr5,.,exon,148377588,148378845,.,-,.,PAPA.TDP43-G_S7.6392,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
7,chr5,.,exon,148377588,148378845,.,-,.,PAPA.Cont-C_S3.6203,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
8,chr5,.,exon,148377695,148378845,.,-,.,PAPA.Cont-D_S4.6266,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
9,chr5,.,exon,148378366,148378845,.,-,.,PAPA.TDP43-E_S5.6788,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1


In [14]:
# keep only the identifier columns needed downstream
# (the displayed result still carries Chromosome/Start/End/Strand alongside them)
le_keep_cols = ["le_id", "transcript_id", "ref_gene_name", "ref_gene_id"]
full_bleeds_le_mv = full_bleeds_le_mv[le_keep_cols]
full_bleeds_le_mv

Unnamed: 0,Chromosome,Start,End,Strand,le_id,transcript_id,ref_gene_name,ref_gene_id
0,chr1,21453372,21457150,+,ENSG00000142794.19_3,PAPA.TDP-1.345.1,NBPF3,ENSG00000142794.19
1,chr1,84561927,84563418,-,ENSG00000117151.13_1,PAPA.ctrl_ctrl_1.1091.9,CTBS,ENSG00000117151.13
2,chr2,225651431,225658565,+,ENSG00000144460.13_2,PAPA.TDP43_ctrl_3.4451.2,NYAP2,ENSG00000144460.13
3,chr2,197630416,197631220,-,ENSG00000162944.11_1,PAPA.TDP43_ctrl_4.4119.3,RFTN2,ENSG00000162944.11
4,chr2,197630635,197631220,-,ENSG00000162944.11_1,PAPA.TDP-4.4038.5,RFTN2,ENSG00000162944.11
5,chr5,148377588,148378845,-,ENSG00000247199.6_1,PAPA.TDP43_19065413_S19.7930.4,FBXO38-DT,ENSG00000247199.6
6,chr5,148377588,148378845,-,ENSG00000247199.6_1,PAPA.TDP43-G_S7.6392.4,FBXO38-DT,ENSG00000247199.6
7,chr5,148377588,148378845,-,ENSG00000247199.6_1,PAPA.Cont-C_S3.6203.3,FBXO38-DT,ENSG00000247199.6
8,chr5,148377695,148378845,-,ENSG00000247199.6_1,PAPA.Cont-D_S4.6266.4,FBXO38-DT,ENSG00000247199.6
9,chr5,148378366,148378845,-,ENSG00000247199.6_1,PAPA.TDP43-E_S5.6788.4,FBXO38-DT,ENSG00000247199.6


In [15]:
%%time
# read in reference GTF (used to define last exons), subsetting only to cryptic-containing genes to save memory
ref_gtf = pr.read_gtf("data/reference_filtered.gtf").subset(lambda df: df.gene_name.isin(set(bleeds_y.gene_name)))
ref_gtf

CPU times: user 29.4 s, sys: 4.94 s, total: 34.3 s
Wall time: 1min 6s


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,hgnc_id,protein_id,ccdsid
0,chr1,HAVANA,transcript,21440127,21483467,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,,,HGNC:25076,ENSP00000415711.2,CCDS57977.1
1,chr1,HAVANA,exon,21440127,21440348,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,1,ENSE00001546346.2,HGNC:25076,ENSP00000415711.2,CCDS57977.1
2,chr1,HAVANA,exon,21444947,21445219,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
3,chr1,HAVANA,CDS,21445086,21445219,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
4,chr1,HAVANA,start_codon,21445086,21445089,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1415,chrX,HAVANA,exon,131755597,131755746,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,9,ENSE00001752569.1,HGNC:49627,,
1416,chrX,HAVANA,exon,131749305,131749458,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,10,ENSE00001674554.1,HGNC:49627,,
1417,chrX,HAVANA,exon,131711650,131711720,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,11,ENSE00001594498.1,HGNC:49627,,
1418,chrX,HAVANA,exon,131709497,131709556,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,12,ENSE00001734033.1,HGNC:49627,,


In [16]:
# Restrict the reference annotation to its CDS records
def _is_cds(df):
    """Boolean row mask selecting entries whose Feature is 'CDS'."""
    return df["Feature"] == "CDS"


ref_cds = ref_gtf.subset(_is_cds)
ref_cds

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,hgnc_id,protein_id,ccdsid
0,chr1,HAVANA,CDS,21445086,21445219,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
1,chr1,HAVANA,CDS,21470631,21470734,.,+,2,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,3,ENSE00003734729.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
2,chr1,HAVANA,CDS,21471568,21471783,.,+,1,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,4,ENSE00003723776.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
3,chr1,HAVANA,CDS,21472842,21472915,.,+,2,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,5,ENSE00003753961.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
4,chr1,HAVANA,CDS,21473379,21473585,.,+,1,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,6,ENSE00003747501.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,chrX,HAVANA,CDS,91876780,91879273,.,+,0,ENSG00000102290.23,protein_coding,...,protein_coding,PCDH11X-207,1.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000359297.1,2,ENSE00003757792.1,HGNC:8656,ENSP00000384758.1,CCDS55459.1
422,chrX,HAVANA,CDS,92201374,92201455,.,+,0,ENSG00000102290.23,protein_coding,...,protein_coding,PCDH11X-207,1.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000359297.1,3,ENSE00001300322.1,HGNC:8656,ENSP00000384758.1,CCDS55459.1
423,chrX,HAVANA,CDS,92263113,92263143,.,+,0,ENSG00000102290.23,protein_coding,...,protein_coding,PCDH11X-207,1.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000359297.1,4,ENSE00001327755.1,HGNC:8656,ENSP00000384758.1,CCDS55459.1
424,chrX,HAVANA,CDS,92387734,92387933,.,+,0,ENSG00000102290.23,protein_coding,...,protein_coding,PCDH11X-207,1.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000359297.1,5,ENSE00001718872.1,HGNC:8656,ENSP00000384758.1,CCDS55459.1


In [17]:
# List the ref_cds columns to double check an exon_number/region_number style column is available.
# NOTE(review): this only shows which columns exist - whether exon_number is strand aware has to be
# confirmed from the values themselves (TODO confirm)
ref_cds.columns


Index(['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_type', 'gene_name', 'level', 'havana_gene',
       'transcript_id', 'transcript_type', 'transcript_name',
       'transcript_support_level', 'tag', 'havana_transcript', 'exon_number',
       'exon_id', 'hgnc_id', 'protein_id', 'ccdsid'],
      dtype='object')

In [18]:
# trim the CDS annotation down to the metadata columns used downstream
# (the displayed result still carries Chromosome/Start/End/Strand alongside them)
cds_keep_cols = ["Frame", "gene_id", "gene_name", "transcript_id", "exon_number", "Feature"]
ref_cds = ref_cds[cds_keep_cols]
ref_cds

Unnamed: 0,Chromosome,Frame,Start,End,Strand,gene_id,gene_name,transcript_id,exon_number,Feature
0,chr1,0,21445086,21445219,+,ENSG00000142794.19,NBPF3,ENST00000454000.6,2,CDS
1,chr1,2,21470631,21470734,+,ENSG00000142794.19,NBPF3,ENST00000454000.6,3,CDS
2,chr1,1,21471568,21471783,+,ENSG00000142794.19,NBPF3,ENST00000454000.6,4,CDS
3,chr1,2,21472842,21472915,+,ENSG00000142794.19,NBPF3,ENST00000454000.6,5,CDS
4,chr1,1,21473379,21473585,+,ENSG00000142794.19,NBPF3,ENST00000454000.6,6,CDS
...,...,...,...,...,...,...,...,...,...,...
421,chrX,0,91876780,91879273,+,ENSG00000102290.23,PCDH11X,ENST00000406881.3,2,CDS
422,chrX,0,92201374,92201455,+,ENSG00000102290.23,PCDH11X,ENST00000406881.3,3,CDS
423,chrX,0,92263113,92263143,+,ENSG00000102290.23,PCDH11X,ENST00000406881.3,4,CDS
424,chrX,0,92387734,92387933,+,ENSG00000102290.23,PCDH11X,ENST00000406881.3,5,CDS


In [19]:
# Step 1 - find the annotated CDS regions that correspond to the bleedthrough last exon.
# Definition requires that the 5'ends exactly match: overlap CDS with the bleedthrough last exons
# on the same strand, then keep pairs sharing a 5'end (Start on the + strand, End on the - strand).
# The 3'end of the annotated CDS is updated to the bleedthrough 3'end in the next step.
def _shares_5p_end(df):
    """Row mask: CDS and joined bleedthrough last exon (suffix _bl) have the same 5'end."""
    plus_match = (df.Strand == "+") & (df.Start == df["Start_bl"])
    minus_match = (df.Strand == "-") & (df.End == df["End_bl"])
    return plus_match | minus_match


cds_x_bleeds = ref_cds.join(full_bleeds_le_mv, strandedness="same", suffix="_bl")
ref_cds_bld = cds_x_bleeds.subset(_shares_5p_end)

ref_cds_bld


Unnamed: 0,Chromosome,Frame,Start,End,Strand,gene_id,gene_name,transcript_id,exon_number,Feature,Start_bl,End_bl,Strand_bl,le_id,transcript_id_bl,ref_gene_name,ref_gene_id
0,chr1,0,84563256,84563418,-,ENSG00000117151.13,CTBS,ENST00000370630.6,6,CDS,84561927,84563418,-,ENSG00000117151.13_1,PAPA.ctrl_ctrl_1.1091.9,CTBS,ENSG00000117151.13
1,chr2,2,225651431,225651535,+,ENSG00000144460.13,NYAP2,ENST00000272907.8,7,CDS,225651431,225658565,+,ENSG00000144460.13_2,PAPA.TDP43_ctrl_3.4451.2,NYAP2,ENSG00000144460.13
2,chr2,2,197631010,197631220,-,ENSG00000162944.11,RFTN2,ENST00000295049.9,5,CDS,197630416,197631220,-,ENSG00000162944.11_1,PAPA.TDP43_ctrl_4.4119.3,RFTN2,ENSG00000162944.11
3,chr2,2,197631010,197631220,-,ENSG00000162944.11,RFTN2,ENST00000295049.9,5,CDS,197630635,197631220,-,ENSG00000162944.11_1,PAPA.TDP-4.4038.5,RFTN2,ENSG00000162944.11
4,chr6,1,42935573,42935670,+,ENSG00000137161.18,CNPY3,ENST00000372836.5,3,CDS,42935573,42936378,+,ENSG00000137161.18_1,PAPA.TDP-4.8982.1,CNPY3,ENSG00000137161.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,chr19,0,16854142,16854261,+,ENSG00000127511.10,SIN3B,ENST00000248054.10,8,CDS,16854142,16857420,+,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,SIN3B,ENSG00000127511.10
58,chr19,0,16854142,16854262,+,ENSG00000127511.10,SIN3B,ENST00000596802.5,8,CDS,16854142,16857420,+,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,SIN3B,ENSG00000127511.10
59,chr19,0,16854142,16854261,+,ENSG00000127511.10,SIN3B,ENST00000379803.5,8,CDS,16854142,16857420,+,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,SIN3B,ENSG00000127511.10
60,chr19,1,32639355,32639488,-,ENSG00000105186.16,ANKRD27,ENST00000306065.9,12,CDS,32635458,32639488,-,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,ANKRD27,ENSG00000105186.16


In [20]:
# Replace Start/End with the joined bleedthrough coordinates: because the 5'ends match, the 5'end
# is unchanged and only the 3'end moves to the end of the bleedthrough event.
cds_with_bleed_coords = ref_cds_bld.new_position("swap")
# drop the leftover joined columns (names starting with 'ref' or ending in 'bl')
ref_cds_bld_swap = cds_with_bleed_coords.drop(like="^ref|bl$")
ref_cds_bld_swap

Unnamed: 0,Chromosome,Frame,Start,End,Strand,gene_id,gene_name,transcript_id,exon_number,Feature,le_id
0,chr1,0,84561927,84563418,-,ENSG00000117151.13,CTBS,ENST00000370630.6,6,CDS,ENSG00000117151.13_1
1,chr2,2,225651431,225658565,+,ENSG00000144460.13,NYAP2,ENST00000272907.8,7,CDS,ENSG00000144460.13_2
2,chr2,2,197630416,197631220,-,ENSG00000162944.11,RFTN2,ENST00000295049.9,5,CDS,ENSG00000162944.11_1
3,chr2,2,197630635,197631220,-,ENSG00000162944.11,RFTN2,ENST00000295049.9,5,CDS,ENSG00000162944.11_1
4,chr6,1,42935573,42936378,+,ENSG00000137161.18,CNPY3,ENST00000372836.5,3,CDS,ENSG00000137161.18_1
...,...,...,...,...,...,...,...,...,...,...,...
57,chr19,0,16854142,16857420,+,ENSG00000127511.10,SIN3B,ENST00000248054.10,8,CDS,ENSG00000127511.10_2
58,chr19,0,16854142,16857420,+,ENSG00000127511.10,SIN3B,ENST00000596802.5,8,CDS,ENSG00000127511.10_2
59,chr19,0,16854142,16857420,+,ENSG00000127511.10,SIN3B,ENST00000379803.5,8,CDS,ENSG00000127511.10_2
60,chr19,1,32635458,32639488,-,ENSG00000105186.16,ANKRD27,ENST00000306065.9,12,CDS,ENSG00000105186.16_3


In [21]:
# Build a 'cryptic transcript ID' by combining the annotated transcript ID with the le_id
# (joined with ';'). exon_number is kept as exon_number_le, i.e. the exon number of the
# CDS exon that was extended into the bleedthrough last exon.
cryp_txid = (
    ref_cds_bld_swap.as_df()
    .loc[:, ["gene_id", "gene_name", "transcript_id", "le_id", "exon_number"]]
    .rename(columns={"exon_number": "exon_number_le"})
    .assign(cryptic_transcript_id=lambda df: df["transcript_id"].str.cat(df["le_id"], sep=";"))
    .drop_duplicates()
)
cryp_txid

Unnamed: 0,gene_id,gene_name,transcript_id,le_id,exon_number_le,cryptic_transcript_id
0,ENSG00000117151.13,CTBS,ENST00000370630.6,ENSG00000117151.13_1,6,ENST00000370630.6;ENSG00000117151.13_1
1,ENSG00000144460.13,NYAP2,ENST00000272907.8,ENSG00000144460.13_2,7,ENST00000272907.8;ENSG00000144460.13_2
2,ENSG00000162944.11,RFTN2,ENST00000295049.9,ENSG00000162944.11_1,5,ENST00000295049.9;ENSG00000162944.11_1
4,ENSG00000137161.18,CNPY3,ENST00000372836.5,ENSG00000137161.18_1,3,ENST00000372836.5;ENSG00000137161.18_1
11,ENSG00000158604.15,TMED4,ENST00000457408.7,ENSG00000158604.15_1,4,ENST00000457408.7;ENSG00000158604.15_1
13,ENSG00000158604.15,TMED4,ENST00000481238.1,ENSG00000158604.15_1,4,ENST00000481238.1;ENSG00000158604.15_1
15,ENSG00000155897.10,ADCY8,ENST00000286355.10,ENSG00000155897.10_1,7,ENST00000286355.10;ENSG00000155897.10_1
19,ENSG00000155897.10,ADCY8,ENST00000377928.7,ENSG00000155897.10_1,7,ENST00000377928.7;ENSG00000155897.10_1
23,ENSG00000119392.16,GLE1,ENST00000309971.9,ENSG00000119392.16_1,3,ENST00000309971.9;ENSG00000119392.16_1
26,ENSG00000119392.16,GLE1,ENST00000372770.4,ENSG00000119392.16_1,3,ENST00000372770.4;ENSG00000119392.16_1


In [22]:
# Expand cryp_txid so that each cryptic_transcript_id has exon_number_le rows, labelled with
# exon number 1..n (n = exon_number_le).
# Essentially this gives the exon structure of a 'truncated' transcript, so the CDS annotations up to
# and including the bleedthrough can be pulled out
# --> peptide sequence for the cryptic-containing transcript

# repeat each row's index label exon_number_le times, then select by the repeated labels
uniq_cryp_txid = cryp_txid.drop_duplicates()
repeated_labels = uniq_cryp_txid.index.repeat(uniq_cryp_txid["exon_number_le"])
trnc_cryp_tx = uniq_cryp_txid.loc[repeated_labels]

# number the copies 1..n within each cryptic transcript
exon_counter = trnc_cryp_tx.groupby("cryptic_transcript_id").cumcount()
trnc_cryp_tx["exon_number"] = exon_counter + 1

# spot check a single gene
trnc_cryp_tx[trnc_cryp_tx.gene_name == "SIN3B"]

Unnamed: 0,gene_id,gene_name,transcript_id,le_id,exon_number_le,cryptic_transcript_id,exon_number
57,ENSG00000127511.10,SIN3B,ENST00000248054.10,ENSG00000127511.10_2,8,ENST00000248054.10;ENSG00000127511.10_2,1
57,ENSG00000127511.10,SIN3B,ENST00000248054.10,ENSG00000127511.10_2,8,ENST00000248054.10;ENSG00000127511.10_2,2
57,ENSG00000127511.10,SIN3B,ENST00000248054.10,ENSG00000127511.10_2,8,ENST00000248054.10;ENSG00000127511.10_2,3
57,ENSG00000127511.10,SIN3B,ENST00000248054.10,ENSG00000127511.10_2,8,ENST00000248054.10;ENSG00000127511.10_2,4
57,ENSG00000127511.10,SIN3B,ENST00000248054.10,ENSG00000127511.10_2,8,ENST00000248054.10;ENSG00000127511.10_2,5
57,ENSG00000127511.10,SIN3B,ENST00000248054.10,ENSG00000127511.10_2,8,ENST00000248054.10;ENSG00000127511.10_2,6
57,ENSG00000127511.10,SIN3B,ENST00000248054.10,ENSG00000127511.10_2,8,ENST00000248054.10;ENSG00000127511.10_2,7
57,ENSG00000127511.10,SIN3B,ENST00000248054.10,ENSG00000127511.10_2,8,ENST00000248054.10;ENSG00000127511.10_2,8
58,ENSG00000127511.10,SIN3B,ENST00000596802.5,ENSG00000127511.10_2,8,ENST00000596802.5;ENSG00000127511.10_2,1
58,ENSG00000127511.10,SIN3B,ENST00000596802.5,ENSG00000127511.10_2,8,ENST00000596802.5;ENSG00000127511.10_2,2


In [23]:
# construct a gr of CDS objects
# should be all annotated cds

In [24]:
# Construct a gr of CDSs up to and including the bleedthrough last exon for each truncated transcript
# (exon numbers are compared and joined as strings in this cell)
trnc_cryp_tx = trnc_cryp_tx.astype({"exon_number": str, "exon_number_le": str})
is_bleed_exon = trnc_cryp_tx["exon_number"] == trnc_cryp_tx["exon_number_le"]

# 1) the bleedthrough last exon: coordinates come from the extended CDS (ref_cds_bld_swap)
trnc_cryp_tx_bl = trnc_cryp_tx.loc[is_bleed_exon, :]
bleed_cds_coords = (
    ref_cds_bld_swap.as_df()
    .astype({"exon_number": str})
    .drop(columns=["gene_id", "gene_name"])
)
trnc_cds_bl = trnc_cryp_tx_bl.merge(
    bleed_cds_coords,
    on=["transcript_id", "le_id", "exon_number"],
    how="left",
)

# left join, so a missing Chromosome means no coordinates were joined - every row must have them
assert not trnc_cds_bl["Chromosome"].isna().any()

# 2) all exons before the bleedthrough: coordinates come from the annotated CDS
trnc_cryp_tx_bf = trnc_cryp_tx.loc[~is_bleed_exon, :]
annot_cds_coords = (
    ref_cds.as_df()
    .astype({"exon_number": str})
    .drop(columns=["gene_id", "gene_name"])
)
# default (inner) join: some exons may not contain CDS (i.e. upstream of the true CDS start),
# so only exon numbers with a matching CDS record are kept
trnc_cds_bf = trnc_cryp_tx_bf.merge(annot_cds_coords, on=["exon_number", "transcript_id"])

# report (without stopping) any rows lacking joined coordinates
# NOTE(review): with an inner join unmatched rows are dropped rather than NaN-filled, so this is
# not expected to ever print anything - confirm whether a left join was intended
bf_without_coords = trnc_cds_bf.loc[trnc_cds_bf["Chromosome"].isna(), :]
if len(bf_without_coords) != 0:
    print(bf_without_coords)

# combine the two dfs and convert to a pyranges object
trnc_cds_comb = pd.concat([trnc_cds_bl, trnc_cds_bf])
# move the last five columns to the front
# NOTE(review): this leaves Chromosome as the final column (see the displayed result) - presumably
# the intent was to bring all coordinate columns forward; harmless for the PyRanges built below
comb_cols = trnc_cds_comb.columns.tolist()
new_order = comb_cols[-5:] + comb_cols[:-5]
trnc_cds_comb = trnc_cds_comb[new_order]
trnc_cds_comb_gr = pr.PyRanges(trnc_cds_comb).sort()

trnc_cds_comb_gr

Unnamed: 0,Frame,Start,End,Strand,Feature,gene_id,gene_name,transcript_id,le_id,exon_number_le,cryptic_transcript_id,exon_number,Chromosome
0,0,84561927,84563418,-,CDS,ENSG00000117151.13,CTBS,ENST00000370630.6,ENSG00000117151.13_1,6,ENST00000370630.6;ENSG00000117151.13_1,6,chr1
1,2,84563734,84563832,-,CDS,ENSG00000117151.13,CTBS,ENST00000370630.6,ENSG00000117151.13_1,6,ENST00000370630.6;ENSG00000117151.13_1,5,chr1
2,0,84565840,84566012,-,CDS,ENSG00000117151.13,CTBS,ENST00000370630.6,ENSG00000117151.13_1,6,ENST00000370630.6;ENSG00000117151.13_1,4,chr1
3,2,84569930,84570139,-,CDS,ENSG00000117151.13,CTBS,ENST00000370630.6,ENSG00000117151.13_1,6,ENST00000370630.6;ENSG00000117151.13_1,3,chr1
4,0,84570581,84570720,-,CDS,ENSG00000117151.13,CTBS,ENST00000370630.6,ENSG00000117151.13_1,6,ENST00000370630.6;ENSG00000117151.13_1,2,chr1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,0,32646458,32646615,-,CDS,ENSG00000105186.16,ANKRD27,ENST00000587352.5,ENSG00000105186.16_3,12,ENST00000587352.5;ENSG00000105186.16_3,4,chr19
172,0,32649681,32649792,-,CDS,ENSG00000105186.16,ANKRD27,ENST00000587352.5,ENSG00000105186.16_3,12,ENST00000587352.5;ENSG00000105186.16_3,3,chr19
173,0,32649681,32649792,-,CDS,ENSG00000105186.16,ANKRD27,ENST00000306065.9,ENSG00000105186.16_3,12,ENST00000306065.9;ENSG00000105186.16_3,3,chr19
174,0,32658913,32659015,-,CDS,ENSG00000105186.16,ANKRD27,ENST00000306065.9,ENSG00000105186.16_3,12,ENST00000306065.9;ENSG00000105186.16_3,2,chr19


In [25]:
# Translate the truncated (cryptic-containing) transcripts.
# Note: the exon number col must be strand aware and int dtype to allow proper sorting
trnc_cds_comb_gr = trnc_cds_comb_gr.apply(
    lambda df: df.assign(exon_number=df["exon_number"].astype(int))
)

# one translation per cryptic transcript, then re-attach the gene/transcript/le_id annotation
trnc_cds_peptides = pd.merge(
    translate_tx(trnc_cds_comb_gr, "cryptic_transcript_id", add_stop=True),
    cryp_txid,
    on="cryptic_transcript_id",
)
trnc_cds_peptides




Unnamed: 0,cryptic_transcript_id,exon_number,Frame,peptide_seq,gene_id,gene_name,transcript_id,le_id,exon_number_le
0,ENST00000219689.12;ENSG00000103404.15_1,1,0,MSKVTAPGSGPPAAASGKEKRSFSKRLFRSGRAGGGGAGGPGASGP...,ENSG00000103404.15,USP31,ENST00000219689.12,ENSG00000103404.15_1,13
1,ENST00000248054.10;ENSG00000127511.10_2,1,0,MAHAGGGSGGSGAGGPAGRGLSGARWGRSGSAGHEKLPVHVEDALT...,ENSG00000127511.10,SIN3B,ENST00000248054.10,ENSG00000127511.10_2,8
2,ENST00000262127.7;ENSG00000101624.11_3,1,0,MSLPPEKASELKQLIHQQLSKMDVHGRIREILAETIREELAPDQQH...,ENSG00000101624.11,CEP76,ENST00000262127.7,ENSG00000101624.11_3,12
3,ENST00000272907.8;ENSG00000144460.13_2,3,0,MISSKMMSSNPEEDPLDTFLQYIEDMGMKAYDGLVIQNASDIAREN...,ENSG00000144460.13,NYAP2,ENST00000272907.8,ENSG00000144460.13_2,7
4,ENST00000286355.10;ENSG00000155897.10_1,1,0,MELSDVRCLTGSEELYTIHPTPPAGDGRSASRPQRLLWQTAVRHIT...,ENSG00000155897.10,ADCY8,ENST00000286355.10,ENSG00000155897.10_1,7
5,ENST00000295049.9;ENSG00000162944.11_1,1,0,MGCGLRKLEDPDDSSPGKIFSTLKRPQVETKTEFAYEYVLLDFTLQ...,ENSG00000162944.11,RFTN2,ENST00000295049.9,ENSG00000162944.11_1,5
6,ENST00000306065.9;ENSG00000105186.16_3,2,0,MALYDEDLLKNPFYLALQKCRPDLCSKVAQIHGIVLVPCKGSLSSS...,ENSG00000105186.16,ANKRD27,ENST00000306065.9,ENSG00000105186.16_3,12
7,ENST00000309971.9;ENSG00000119392.16_1,1,0,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...,ENSG00000119392.16,GLE1,ENST00000309971.9,ENSG00000119392.16_1,3
8,ENST00000370630.6;ENSG00000117151.13_1,1,0,MSRPQLRRWRLVSSPPSGVPGLALLALLALLALRLAAGTDCPCPEP...,ENSG00000117151.13,CTBS,ENST00000370630.6,ENSG00000117151.13_1,6
9,ENST00000372770.4;ENSG00000119392.16_1,1,0,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...,ENSG00000119392.16,GLE1,ENST00000372770.4,ENSG00000119392.16_1,3


In [26]:
# Translate the annotated (full-length) transcripts that matched a bleedthrough event
ref_cds = ref_cds.apply(lambda df: df.assign(exon_number=df["exon_number"].astype(int)))

matched_tx_ids = cryp_txid["transcript_id"]
ref_cds_matched = ref_cds.subset(lambda df: df["transcript_id"].isin(matched_tx_ids))

# translate, then re-attach the annotation (including the corresponding cryptic_transcript_id)
ref_cds_peptides = pd.merge(
    translate_tx(ref_cds_matched, "transcript_id", add_stop=True),
    cryp_txid,
    on="transcript_id",
)
ref_cds_peptides

Unnamed: 0,transcript_id,exon_number,Frame,peptide_seq,gene_id,gene_name,le_id,exon_number_le,cryptic_transcript_id
0,ENST00000219689.12,1,0,MSKVTAPGSGPPAAASGKEKRSFSKRLFRSGRAGGGGAGGPGASGP...,ENSG00000103404.15,USP31,ENSG00000103404.15_1,13,ENST00000219689.12;ENSG00000103404.15_1
1,ENST00000248054.10,1,0,MAHAGGGSGGSGAGGPAGRGLSGARWGRSGSAGHEKLPVHVEDALT...,ENSG00000127511.10,SIN3B,ENSG00000127511.10_2,8,ENST00000248054.10;ENSG00000127511.10_2
2,ENST00000262127.7,1,0,MSLPPEKASELKQLIHQQLSKMDVHGRIREILAETIREELAPDQQH...,ENSG00000101624.11,CEP76,ENSG00000101624.11_3,12,ENST00000262127.7;ENSG00000101624.11_3
3,ENST00000272907.8,3,0,MISSKMMSSNPEEDPLDTFLQYIEDMGMKAYDGLVIQNASDIAREN...,ENSG00000144460.13,NYAP2,ENSG00000144460.13_2,7,ENST00000272907.8;ENSG00000144460.13_2
4,ENST00000286355.10,1,0,MELSDVRCLTGSEELYTIHPTPPAGDGRSASRPQRLLWQTAVRHIT...,ENSG00000155897.10,ADCY8,ENSG00000155897.10_1,7,ENST00000286355.10;ENSG00000155897.10_1
5,ENST00000295049.9,1,0,MGCGLRKLEDPDDSSPGKIFSTLKRPQVETKTEFAYEYVLLDFTLQ...,ENSG00000162944.11,RFTN2,ENSG00000162944.11_1,5,ENST00000295049.9;ENSG00000162944.11_1
6,ENST00000306065.9,2,0,MALYDEDLLKNPFYLALQKCRPDLCSKVAQIHGIVLVPCKGSLSSS...,ENSG00000105186.16,ANKRD27,ENSG00000105186.16_3,12,ENST00000306065.9;ENSG00000105186.16_3
7,ENST00000309971.9,1,0,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...,ENSG00000119392.16,GLE1,ENSG00000119392.16_1,3,ENST00000309971.9;ENSG00000119392.16_1
8,ENST00000370630.6,1,0,MSRPQLRRWRLVSSPPSGVPGLALLALLALLALRLAAGTDCPCPEP...,ENSG00000117151.13,CTBS,ENSG00000117151.13_1,6,ENST00000370630.6;ENSG00000117151.13_1
9,ENST00000372770.4,1,0,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...,ENSG00000119392.16,GLE1,ENSG00000119392.16_1,3,ENST00000372770.4;ENSG00000119392.16_1


In [27]:
# Pair each cryptic peptide with the peptide of its annotated transcript, then find the point up to
# which the two match so the region unique to the cryptic peptide can be sliced out
def _uniq_region_start(row):
    """Slice position for the cryptic-specific region: longest_matching_substring(...) + 1.

    NOTE(review): presumably the helper returns the index where the shared sequence ends - confirm
    against peptide_helpers.
    """
    return longest_matching_substring(row["peptide_seq_cryptic"], row["peptide_seq_full"]) + 1


def _uniq_region(row):
    """Cryptic peptide from the slice position onwards."""
    return row["peptide_seq_cryptic"][row["longest_match_slice"]:]


cryptic_pep = trnc_cds_peptides[["cryptic_transcript_id", "gene_name", "peptide_seq"]]
full_pep = ref_cds_peptides[["cryptic_transcript_id", "peptide_seq"]]
pep_comb = pd.merge(cryptic_pep, full_pep, on="cryptic_transcript_id", suffixes=["_cryptic", "_full"])

pep_comb["longest_match_slice"] = pep_comb.apply(_uniq_region_start, axis=1)
# slice the cryptic peptide to get the unique region
pep_comb["peptide_seq_cryptic_uniq"] = pep_comb.apply(_uniq_region, axis=1)

# column order used when writing the table out
out_order = [
    "cryptic_transcript_id",
    "gene_name",
    "peptide_seq_cryptic",
    "peptide_seq_full",
    "peptide_seq_cryptic_uniq",
    "longest_match_slice",
]

pep_comb

Unnamed: 0,cryptic_transcript_id,gene_name,peptide_seq_cryptic,peptide_seq_full,longest_match_slice,peptide_seq_cryptic_uniq
0,ENST00000219689.12;ENSG00000103404.15_1,USP31,MSKVTAPGSGPPAAASGKEKRSFSKRLFRSGRAGGGGAGGPGASGP...,MSKVTAPGSGPPAAASGKEKRSFSKRLFRSGRAGGGGAGGPGASGP...,725,GLQCERLRVCAGESFRRVCLWQGDLTLEPHFPSGRQCANCEGSFHP...
1,ENST00000248054.10;ENSG00000127511.10_2,SIN3B,MAHAGGGSGGSGAGGPAGRGLSGARWGRSGSAGHEKLPVHVEDALT...,MAHAGGGSGGSGAGGPAGRGLSGARWGRSGSAGHEKLPVHVEDALT...,353,*
2,ENST00000262127.7;ENSG00000101624.11_3,CEP76,MSLPPEKASELKQLIHQQLSKMDVHGRIREILAETIREELAPDQQH...,MSLPPEKASELKQLIHQQLSKMDVHGRIREILAETIREELAPDQQH...,660,
3,ENST00000272907.8;ENSG00000144460.13_2,NYAP2,MISSKMMSSNPEEDPLDTFLQYIEDMGMKAYDGLVIQNASDIAREN...,MISSKMMSSNPEEDPLDTFLQYIEDMGMKAYDGLVIQNASDIAREN...,645,PSSLANRD*
4,ENST00000286355.10;ENSG00000155897.10_1,ADCY8,MELSDVRCLTGSEELYTIHPTPPAGDGRSASRPQRLLWQTAVRHIT...,MELSDVRCLTGSEELYTIHPTPPAGDGRSASRPQRLLWQTAVRHIT...,637,VSPLFLLLLGHECMLVCISNPDTSEKTDSSLTFSPMNTIIANDQTI...
5,ENST00000295049.9;ENSG00000162944.11_1,RFTN2,MGCGLRKLEDPDDSSPGKIFSTLKRPQVETKTEFAYEYVLLDFTLQ...,MGCGLRKLEDPDDSSPGKIFSTLKRPQVETKTEFAYEYVLLDFTLQ...,310,KFYVNDILYLRNLNLYQNQ*
6,ENST00000306065.9;ENSG00000105186.16_3,ANKRD27,MALYDEDLLKNPFYLALQKCRPDLCSKVAQIHGIVLVPCKGSLSSS...,MALYDEDLLKNPFYLALQKCRPDLCSKVAQIHGIVLVPCKGSLSSS...,372,VRSHPCPGLPLWASWFP*
7,ENST00000309971.9;ENSG00000119392.16_1,GLE1,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...,144,VSEPMKEGSLDPASHI*
8,ENST00000370630.6;ENSG00000117151.13_1,CTBS,MSRPQLRRWRLVSSPPSGVPGLALLALLALLALRLAAGTDCPCPEP...,MSRPQLRRWRLVSSPPSGVPGLALLALLALLALRLAAGTDCPCPEP...,319,VRLFVSYEHLFY*
9,ENST00000372770.4;ENSG00000119392.16_1,GLE1,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...,144,VSEPMKEGSLDPASHI*


In [28]:
# write the combined peptide table to TSV, columns in out_order, without the index
pep_comb.to_csv(
    "2023-11-08_i3cortical_cryptic_bleedthrough_fullpeptides.tsv",
    sep="\t",
    columns=out_order,
    header=True,
    index=False,
)

In [55]:
# Prepare FASTA-style records (header + sequence) for the cryptic peptides
# Note that a minimum sequence is needed for this to be viable: keep unique regions longer than 1 character
has_uniq_region = pep_comb["peptide_seq_cryptic_uniq"].str.len() > 1
pep_comb_fa = pep_comb.loc[has_uniq_region, :]

# header will be transcript_id|gene_name|le_id|coords
# first split cryptic_transcript_id back into the reference transcript ID and the le_id
tx_id_parts = (
    pep_comb_fa["cryptic_transcript_id"]
    .str.split(";", expand=True)
    .rename(columns={0: "ref_transcript_id", 1: "le_id"})
)
pep_comb_fa = pd.concat([pep_comb_fa, tx_id_parts], axis="columns")

# create the representative bleedthrough coordinate string (Chromosome:Start:End:Strand)
bleeds_le_cryp_df = bleeds_le_cryp.as_df()
coord_fields = bleeds_le_cryp_df[["Start", "End", "Strand"]].astype(str)
bleeds_le_cryp_df["position"] = bleeds_le_cryp_df["Chromosome"].str.cat(coord_fields, sep=":")

# attach the representative coordinates, keeping one row per le_id + unique peptide
pep_comb_fa = (
    pep_comb_fa
    .merge(bleeds_le_cryp_df[["le_id", "position"]], on="le_id", how="left")
    .drop_duplicates(subset=["le_id", "peptide_seq_cryptic_uniq"])
)

pep_comb_fa = pep_comb_fa.assign(
    # final header - transcript_id|gene_name|le_id|coords
    fa_name=lambda df: df["ref_transcript_id"].str.cat(df[["gene_name", "le_id", "position"]], sep="|"),
    # remove the trailing stop codon symbol from the sequence strings
    peptide_seq_cryptic_uniq_nostop=lambda df: df["peptide_seq_cryptic_uniq"].str.rstrip("*"),
    peptide_seq_cryptic_nostop=lambda df: df["peptide_seq_cryptic"].str.rstrip("*"),
)

pep_comb_fa


# pep_comb_fa.cryptic_transcript_id.str.split(";", expand=True)

Unnamed: 0,cryptic_transcript_id,gene_name,peptide_seq_cryptic,peptide_seq_full,longest_match_slice,peptide_seq_cryptic_uniq,ref_transcript_id,le_id,position,fa_name,peptide_seq_cryptic_uniq_nostop,peptide_seq_cryptic_nostop
0,ENST00000219689.12;ENSG00000103404.15_1,USP31,MSKVTAPGSGPPAAASGKEKRSFSKRLFRSGRAGGGGAGGPGASGP...,MSKVTAPGSGPPAAASGKEKRSFSKRLFRSGRAGGGGAGGPGASGP...,725,GLQCERLRVCAGESFRRVCLWQGDLTLEPHFPSGRQCANCEGSFHP...,ENST00000219689.12,ENSG00000103404.15_1,chr16:23079754:23079945:-,ENST00000219689.12|USP31|ENSG00000103404.15_1|...,GLQCERLRVCAGESFRRVCLWQGDLTLEPHFPSGRQCANCEGSFHP...,MSKVTAPGSGPPAAASGKEKRSFSKRLFRSGRAGGGGAGGPGASGP...
1,ENST00000272907.8;ENSG00000144460.13_2,NYAP2,MISSKMMSSNPEEDPLDTFLQYIEDMGMKAYDGLVIQNASDIAREN...,MISSKMMSSNPEEDPLDTFLQYIEDMGMKAYDGLVIQNASDIAREN...,645,PSSLANRD*,ENST00000272907.8,ENSG00000144460.13_2,chr2:225651535:225658565:+,ENST00000272907.8|NYAP2|ENSG00000144460.13_2|c...,PSSLANRD,MISSKMMSSNPEEDPLDTFLQYIEDMGMKAYDGLVIQNASDIAREN...
2,ENST00000286355.10;ENSG00000155897.10_1,ADCY8,MELSDVRCLTGSEELYTIHPTPPAGDGRSASRPQRLLWQTAVRHIT...,MELSDVRCLTGSEELYTIHPTPPAGDGRSASRPQRLLWQTAVRHIT...,637,VSPLFLLLLGHECMLVCISNPDTSEKTDSSLTFSPMNTIIANDQTI...,ENST00000286355.10,ENSG00000155897.10_1,chr8:130901663:130903771:-,ENST00000286355.10|ADCY8|ENSG00000155897.10_1|...,VSPLFLLLLGHECMLVCISNPDTSEKTDSSLTFSPMNTIIANDQTI...,MELSDVRCLTGSEELYTIHPTPPAGDGRSASRPQRLLWQTAVRHIT...
3,ENST00000295049.9;ENSG00000162944.11_1,RFTN2,MGCGLRKLEDPDDSSPGKIFSTLKRPQVETKTEFAYEYVLLDFTLQ...,MGCGLRKLEDPDDSSPGKIFSTLKRPQVETKTEFAYEYVLLDFTLQ...,310,KFYVNDILYLRNLNLYQNQ*,ENST00000295049.9,ENSG00000162944.11_1,chr2:197630635:197631010:-,ENST00000295049.9|RFTN2|ENSG00000162944.11_1|c...,KFYVNDILYLRNLNLYQNQ,MGCGLRKLEDPDDSSPGKIFSTLKRPQVETKTEFAYEYVLLDFTLQ...
4,ENST00000306065.9;ENSG00000105186.16_3,ANKRD27,MALYDEDLLKNPFYLALQKCRPDLCSKVAQIHGIVLVPCKGSLSSS...,MALYDEDLLKNPFYLALQKCRPDLCSKVAQIHGIVLVPCKGSLSSS...,372,VRSHPCPGLPLWASWFP*,ENST00000306065.9,ENSG00000105186.16_3,chr19:32635458:32639355:-,ENST00000306065.9|ANKRD27|ENSG00000105186.16_3...,VRSHPCPGLPLWASWFP,MALYDEDLLKNPFYLALQKCRPDLCSKVAQIHGIVLVPCKGSLSSS...
5,ENST00000309971.9;ENSG00000119392.16_1,GLE1,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...,144,VSEPMKEGSLDPASHI*,ENST00000309971.9,ENSG00000119392.16_1,chr9:128515639:128516262:+,ENST00000309971.9|GLE1|ENSG00000119392.16_1|ch...,VSEPMKEGSLDPASHI,MPSEGRCWETLKALRSSDKGRLCYYRDWLLRREDVLEECMSLPKLS...
6,ENST00000370630.6;ENSG00000117151.13_1,CTBS,MSRPQLRRWRLVSSPPSGVPGLALLALLALLALRLAAGTDCPCPEP...,MSRPQLRRWRLVSSPPSGVPGLALLALLALLALRLAAGTDCPCPEP...,319,VRLFVSYEHLFY*,ENST00000370630.6,ENSG00000117151.13_1,chr1:84561927:84563256:-,ENST00000370630.6|CTBS|ENSG00000117151.13_1|ch...,VRLFVSYEHLFY,MSRPQLRRWRLVSSPPSGVPGLALLALLALLALRLAAGTDCPCPEP...
8,ENST00000372836.5;ENSG00000137161.18_1,CNPY3,MDSMPEPASRCLLLLPLLLLLLLLLPAPELGPSQAGAEENDWVRLP...,MDSMPEPASRCLLLLPLLLLLLLLLPAPELGPSQAGAEENDWVRLP...,124,VGFGIVLHPLWGQACMYLSVSAGVSVI*,ENST00000372836.5,ENSG00000137161.18_1,chr6:42935670:42936378:+,ENST00000372836.5|CNPY3|ENSG00000137161.18_1|c...,VGFGIVLHPLWGQACMYLSVSAGVSVI,MDSMPEPASRCLLLLPLLLLLLLLLPAPELGPSQAGAEENDWVRLP...
10,ENST00000457408.7;ENSG00000158604.15_1,TMED4,MAGVGAGPLRAMGRQALLLLALCATGAQGLYFHIGETEKRCFIEEI...,MAGVGAGPLRAMGRQALLLLALCATGAQGLYFHIGETEKRCFIEEI...,178,ASAYLLVI*,ENST00000457408.7,ENSG00000158604.15_1,chr7:44579896:44581092:-,ENST00000457408.7|TMED4|ENSG00000158604.15_1|c...,ASAYLLVI,MAGVGAGPLRAMGRQALLLLALCATGAQGLYFHIGETEKRCFIEEI...
11,ENST00000525194.5;ENSG00000078124.13_1,ACER3,MAPAADREGYWGPTTSTLDWCEENYSVTWYIAEFCSFLPSSLKSNL...,MAPAADREGYWGPTTSTLDWCEENYSVTWYIAEFCSFLPSSLKSNL...,47,LVSTHQASFLVLGLWAGRGDIPVI*,ENST00000525194.5,ENSG00000078124.13_1,chr11:76868156:76869066:+,ENST00000525194.5|ACER3|ENSG00000078124.13_1|c...,LVSTHQASFLVLGLWAGRGDIPVI,MAPAADREGYWGPTTSTLDWCEENYSVTWYIAEFCSFLPSSLKSNL...


In [None]:
# Write two FASTA files that share the same headers (fa_name) but differ in the
# sequence column: cryptic-unique peptide vs. full cryptic peptide
# (both with the trailing stop symbol already removed).
fasta_outputs = {
    "2023-11-10_i3cortical_cryptic_bleedthrough.cryptic_uniq_peptide.fa": "peptide_seq_cryptic_uniq_nostop",
    "2023-11-10_i3cortical_cryptic_bleedthrough.cryptic_full_peptide.fa": "peptide_seq_cryptic_nostop",
}

for fa_path, seq_col in fasta_outputs.items():
    with open(fa_path, "w") as out:
        # one record per row: '>' header line followed by the sequence line
        for fa_name, pep_seq in zip(pep_comb_fa["fa_name"], pep_comb_fa[seq_col]):
            out.write(f">{fa_name}\n{pep_seq}\n")

In [22]:
# which bleedthroughs are missing?
# i.e. gene names in bleeds_gn that do not appear in pep_comb.gene_name
genes_with_peptide = set(pep_comb.gene_name)
bleeds_missing_gn = set(bleeds_gn) - genes_with_peptide
bleeds_missing_gn

{'FBXO38-DT', 'FIRRE', 'LIPA', 'NBPF3', 'PCDH11X'}

In [23]:
# are they annotated as protein coding at gene/transcript level?
# list the distinct gene_type / transcript_type combinations per missing gene
ref_gtf_missing_gn = ref_gtf.subset(lambda df: df.gene_name.isin(bleeds_missing_gn))
biotype_cols = ["gene_name", "gene_type", "transcript_type"]
ref_gtf_missing_gn.as_df()[biotype_cols].drop_duplicates()

Unnamed: 0,gene_name,gene_type,transcript_type
0,NBPF3,protein_coding,protein_coding
68,NBPF3,protein_coding,nonsense_mediated_decay
109,NBPF3,protein_coding,processed_transcript
202,FBXO38-DT,lncRNA,lncRNA
208,LIPA,protein_coding,protein_coding
277,LIPA,protein_coding,processed_transcript
290,PCDH11X,protein_coding,processed_transcript
297,PCDH11X,protein_coding,retained_intron
300,PCDH11X,protein_coding,protein_coding
398,FIRRE,lncRNA,lncRNA


In [24]:
# FIRRE & FBXO38-DT can be excluded as they are lncRNAs. NBPF3, LIPA & PCDH11X could all potentially be coding,
# so for those three check whether the exon that they extend is protein-coding/has an annotated CDS
bleeds_missing_gn_pc = ["NBPF3", "LIPA", "PCDH11X"]

# reference CDS/exon entries for the three genes, trimmed to the metadata columns of interest
gtf_keep_cols = ["Frame", "gene_id", "gene_name", "transcript_id", "exon_number", "Feature", "transcript_type"]
ref_gtf_bleeds_missing_pc_ex = ref_gtf.subset(
    lambda df: df.Feature.isin(["CDS", "exon"]) & df.gene_name.isin(bleeds_missing_gn_pc)
)[gtf_keep_cols]


def _shares_strand_aware_start(df):
    """Keep '+' rows where Start equals Start_bl and '-' rows where End equals End_bl."""
    plus_match = (df.Strand == "+") & (df.Start == df["Start_bl"])
    minus_match = (df.Strand == "-") & (df.End == df["End_bl"])
    return plus_match | minus_match


# same-strand overlap join against the bleedthrough intervals (their columns get the "_bl" suffix),
# then keep only the pairs passing the strand-aware coordinate match above
(ref_gtf_bleeds_missing_pc_ex
 .join(full_bleeds_le_mv, strandedness="same", suffix="_bl")
 .subset(_shares_strand_aware_start)
 )

Unnamed: 0,Chromosome,Frame,Start,End,Strand,gene_id,gene_name,transcript_id,exon_number,Feature,transcript_type,Start_bl,End_bl,Strand_bl,le_id,transcript_id_bl,ref_gene_name,ref_gene_id
0,chr1,.,21453372,21454306,+,ENSG00000142794.19,NBPF3,ENST00000478653.6,3,exon,processed_transcript,21453372,21457150,+,ENSG00000142794.19_3,PAPA.TDP-1.345.1,NBPF3,ENSG00000142794.19
1,chr10,.,89335335,89335534,-,ENSG00000107798.18,LIPA,ENST00000489359.1,3,exon,processed_transcript,89330996,89335534,-,ENSG00000107798.18_2,PAPA.ctrl_ctrl_4.14569.8,LIPA,ENSG00000107798.18
2,chr10,.,89335335,89335534,-,ENSG00000107798.18,LIPA,ENST00000489359.1,3,exon,processed_transcript,89330996,89335534,-,ENSG00000107798.18_2,PAPA.TDP43_ctrl_4.14306.12,LIPA,ENSG00000107798.18
3,chr10,.,89335335,89335534,-,ENSG00000107798.18,LIPA,ENST00000489359.1,3,exon,processed_transcript,89330996,89335534,-,ENSG00000107798.18_2,PAPA.TDP-2.13552.8,LIPA,ENSG00000107798.18
4,chr10,.,89335335,89335534,-,ENSG00000107798.18,LIPA,ENST00000489359.1,3,exon,processed_transcript,89331008,89335534,-,ENSG00000107798.18_2,PAPA.TDP43_19065403_S23.13899.16,LIPA,ENSG00000107798.18
5,chr10,.,89335335,89335534,-,ENSG00000107798.18,LIPA,ENST00000489359.1,3,exon,processed_transcript,89332056,89335534,-,ENSG00000107798.18_2,PAPA.TDP-6.13417.11,LIPA,ENSG00000107798.18
6,chr10,.,89335335,89335534,-,ENSG00000107798.18,LIPA,ENST00000489359.1,3,exon,processed_transcript,89334602,89335534,-,ENSG00000107798.18_2,PAPA.TDP43-G_S7.10969.1,LIPA,ENSG00000107798.18
7,chrX,.,91882906,91884007,+,ENSG00000102290.23,PCDH11X,ENST00000298274.12,6,exon,processed_transcript,91882906,91891321,+,ENSG00000102290.23_3,PAPA.TDP43_ctrl_4.26532.4,PCDH11X,ENSG00000102290.23
