In [6]:
import pyranges as pr
import pandas as pd
from Bio.Seq import Seq



In [7]:
def get_terminal_regions(gr: pr.PyRanges,
                         feature_col: str = "Feature",
                         feature_key: str = "exon",
                         id_col: str = "transcript_id",
                         region_number_col: str = "exon_number",
                         which_region: str = "last",
                         filter_single: bool = False,
                         ):
    '''
    Return gr of last exons for each transcript_id
    In process, region_number_col will be converted to type 'int'

    i.e. for minus-strand - largest exon_number for transcript corresponds to FIRST EXON, not last
    Annotated (i.e. Ensembl) reported exon_numbers DO RESPECT STRAND (i.e. max always = last exon)

    if Do respect strand, put source = None (default)
    if Don't respect strand, put source = "stringtie" (i.e. plus strand = max, minus strand = min)

    which_region can be set to 'last' or 'first'
    '''

    assert which_region in ["first", "last"]
    assert region_number_col in gr.columns.tolist()
    assert feature_col in gr.columns.tolist()
    assert id_col in gr.columns.tolist()

    # Make sure only 'exon' features are in the gr
    assert gr.as_df()[feature_col].drop_duplicates().tolist() == [feature_key], "only {} entries should be present in gr".format(feature_key)

    # Make sure region_number_col is int
    try:
        mod_gr = (gr.assign(region_number_col,
                            lambda df: df[region_number_col].astype(float).astype(int),
                            nb_cpu=1)
                  )
    except KeyError:
        # Currently getting weird KeyError with assign for certain chromosome
        # Mostly non-std chrom names
        # No error if do '.<exon_number>' to assign, but this makes inflexible to colname
        # Also no error if gr -> df assign -> gr
        print("pr.assign returned KeyError. Converting {} to int via pandas df conversion".format(region_number_col))

        mod_gr = gr.as_df()
        mod_gr[region_number_col] = mod_gr[region_number_col].astype(float).astype(int)
        mod_gr = pr.PyRanges(mod_gr)


    # Make sure gr is sorted by transcript_id & 'region number' (ascending order so 1..n)
    mod_gr = mod_gr.apply(lambda df: df.sort_values(by=[id_col, region_number_col], ascending=True),
                          nb_cpu=1)


    # Filter out single-exon transcripts
    if filter_single:
        print("Filtering for multi-exon transcripts...")
        print("Before: {}".format(len(set(mod_gr.as_df()[id_col].tolist()))))

        # Setting to 'False' marks all duplicates as True (so keep these)
        mod_gr = mod_gr.subset(lambda df: df.duplicated(subset=[id_col], keep=False), nb_cpu=1)

        print("After: {}".format(len(set(mod_gr.as_df()[id_col].tolist()))))


    # 1 = first region of group regardless of strand
    # Pick last region entry by max region number for each transcript (id_col)
    # Pick first region entry by min region number for each transcript (id_col)

    # keep="last" sets last in ID to 'False' and all others true (negate to keep last only)
    # keep="first" sets first in ID to 'False'

    out_gr = mod_gr.subset(lambda df: ~(df.duplicated(subset=[id_col], keep=which_region)),
                                                          nb_cpu=1
                              )


    return out_gr



def get_internal_regions(gr: pr.PyRanges ,
                         feature_col: str = "Feature",
                         feature_key: str = "exon",
                         id_col: str ="transcript_id",
                         region_number_col: str = "exon_number",
                         ):
    '''
    Return gr of internal exons for each transcript_id
    In process, exon_number_col will be converted to type 'int'
    '''

    assert gr.as_df()[feature_col].drop_duplicates().tolist() == [feature_key], "only {} entries should be present in gr".format(feature_key)


    # Pull out exons, convert exon_number to int
    exons_gr = gr.assign(region_number_col,
                         lambda df: df[region_number_col].astype(float).astype("Int64"),
                         nb_cpu=1)

    # Make sure gr is sorted by transcript_id & 'region number' (ascending order so 1..n)
    exons_gr = exons_gr.apply(lambda df: df.sort_values(by=[id_col,
                                                            region_number_col
                                                            ],
                                                        ascending=True),
                              nb_cpu=1)

    # Filter out 1st + last exons for each ID
    # first exons for each transcript (.ne(1))
    # keep="last" sets last dup value to 'False' & all others True
    # This will filter out last exons

    out_gr = (exons_gr.subset(lambda df: (df[region_number_col].ne(1).astype(bool)) &
                     (df.duplicated(subset=["transcript_id"], keep="last")),
                     nb_cpu=1
                    )
             )

    return out_gr

In [66]:
def translate_exon(gr: pr.PyRanges, fasta: str = "data/GRCh38.primary_assembly.genome.fa", drop_seqs: bool = True, add_stop: bool = False) -> pr.PyRanges:
    '''Generate a peptide sequence for a continuous genomic interval (e.g. exon)

    Parameters
    ----------
    gr : pr.PyRanges
        _description_
    fasta : str, optional
        path to FASTA file of genome sequence, by default "data/GRCh38.primary_assembly.genome.fa"
    drop_seqs : bool, optional
        whether to drop columns containing extracted genomic sequence, by default True

    Returns
    -------
    pr.PyRanges
        input PyRanges object with added 'peptide_seq' column containing str of peptide sequences
    '''

    assert "Frame" in gr.columns
    assert "dna_seq" not in gr.columns

    if add_stop:
        stop_str = "*"
    else:
        stop_str = ""

    # extract genomic sequences for intervals, add as column
    seqs_gr = pr.get_sequence(gr, fasta)
    gr.dna_seq = seqs_gr

    # convert dna_seq to a Seq object to translate (BioPython)
    # remove nucleotides from start if necessary to get complete codon
    gr = gr.assign("seq_dna_seq",
                    lambda df: df.apply(lambda row: Seq(row["dna_seq"][int(row["Frame"]):]),
                                        axis=1)
                                        )
    
    # generate peptide sequence
    gr = gr.assign("peptide_seq",
                   lambda df: df["seq_dna_seq"].apply(lambda x: str(x.translate(to_stop=True)) + stop_str)
                   )

    if drop_seqs:
        return gr.drop(["dna_seq", "seq_dna_seq"])
    
    else:
        return gr
    
    
def translate_tx(gr: pr.PyRanges, grpby: str, fasta: str = "data/GRCh38.primary_assembly.genome.fa", drop_seqs: bool = True) -> pd.DataFrame:
    '''Generate a peptide sequence for a combination of intervals (e.g. transcripts)

    Parameters
    ----------
    gr : pr.PyRanges
        _description_
    grpby : str
        column containing group identifiers
    fasta : str, optional
        path to FASTA file of genome sequence, by default "data/GRCh38.primary_assembly.genome.fa"
    drop_seqs : bool, optional
        whether to drop columns containing extracted genomic sequence, by default True

    Returns
    -------
    pd.DataFrame
        _description_
    '''
    
    assert "Frame" in gr.columns
    assert "exon_number" in gr.columns
    
    # get the frame of the first interval of each group
    tx2frame = gr.as_df()[[grpby, "exon_number", "Frame"]].astype({"Frame": int}).sort_values(by=[grpby, "exon_number"]).drop_duplicates(subset=grpby, keep="first")

    # get a df of transcript sequence for each interval
    tr_seqs = pr.get_transcript_sequence(gr, group_by=grpby, path=fasta)

    # merge back in frame for each group
    tr_seqs = tr_seqs.merge(tx2frame, on=grpby, how="left")

    # convert sequences to Seq object, trimming from start of sequence depenedent on frame
    tr_seqs["seq"] = tr_seqs.apply(lambda row: Seq(row["Sequence"][row["Frame"]:]), axis=1)

    # generate peptide sequence
    tr_seqs["peptide_seq"] = tr_seqs["seq"].apply(lambda x: str(x.translate(to_stop=True)))
    
    if drop_seqs:
        return tr_seqs.drop(columns=["Sequence", "seq"])
    
    else:  
        return tr_seqs


In [9]:
# First define set of IDs & last exons for which to extract peptide seqs
bleeds = pd.read_csv("data/riboseq_manual_verification_of_i3_cortical_cryptic_bleedthroughs.tsv", sep="\t", usecols=["gene_name", "le_id", "exper_cryp", "event_manual_validation", "notes"])

bleeds_y = bleeds[bleeds["event_manual_validation"] == "yes"]
bleeds_y


Unnamed: 0,gene_name,le_id,exper_cryp,event_manual_validation,notes
0,ACER3,ENSG00000078124.13_1,"brown_i3_cortical,seddighi_i3_cortical",yes,"coverage quite low, but reads consistently up ..."
1,ADCY8,ENSG00000155897.10_1,humphrey_i3_cortical,yes,seddighi - event looks real but v lowly expres...
2,ANKRD27,ENSG00000105186.16_3,"brown_i3_cortical,humphrey_i3_cortical,seddigh...",yes,
9,CEP76,ENSG00000101624.11_3,brown_i3_cortical,yes,"looks like real 3'UTR, but unclear whether cry..."
10,CNPY3,ENSG00000137161.18_1,"humphrey_i3_cortical,seddighi_i3_cortical",yes,reads in cryptic also downstream of stops in a...
11,CTBS,ENSG00000117151.13_1,humphrey_i3_cortical,yes,"genuine bleedthrough, not particularly cryptic..."
12,DLEU7,ENSG00000186047.11_2,humphrey_i3_cortical,yes,
14,FBXO38-DT,ENSG00000247199.6_1,humphrey_i3_cortical,yes,very lowly expressed
15,FIRRE,ENSG00000213468.7_1,brown_i3_cortical,yes,lncRNA. Looks real but lowly expressed
17,HECW1,ENSG00000002746.15_3,brown_i3_cortical,yes,definitely upregulated but also basal IR. Even...


In [10]:
%%time
# read in reference GTF (used to define last exons), subsetting only to cryptic-containing genes to save memory
ref_gtf = pr.read_gtf("data/reference_filtered.gtf").subset(lambda df: df.gene_name.isin(set(bleeds_y.gene_name)))
ref_gtf

CPU times: user 21.8 s, sys: 4.84 s, total: 26.7 s
Wall time: 1min 9s


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,hgnc_id,protein_id,ccdsid
0,chr1,HAVANA,transcript,21440127,21483467,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,,,HGNC:25076,ENSP00000415711.2,CCDS57977.1
1,chr1,HAVANA,exon,21440127,21440348,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,1,ENSE00001546346.2,HGNC:25076,ENSP00000415711.2,CCDS57977.1
2,chr1,HAVANA,exon,21444947,21445219,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
3,chr1,HAVANA,CDS,21445086,21445219,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
4,chr1,HAVANA,start_codon,21445086,21445089,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,chrX,HAVANA,exon,131755597,131755746,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,9,ENSE00001752569.1,HGNC:49627,,
1537,chrX,HAVANA,exon,131749305,131749458,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,10,ENSE00001674554.1,HGNC:49627,,
1538,chrX,HAVANA,exon,131711650,131711720,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,11,ENSE00001594498.1,HGNC:49627,,
1539,chrX,HAVANA,exon,131709497,131709556,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,12,ENSE00001734033.1,HGNC:49627,,


In [11]:
# read in last exon references
# 'quant last exons', which is unique regions of last exons only, will first be used to define 'annotated bleedthroughs'
quant_le = pr.read_gtf("data/novel_ref_combined.quant.last_exons.gtf")
full_le = pr.read_gtf("data/novel_ref_combined.last_exons.gtf")

# subset to bleedthroughs only
quant_le_bld = quant_le.subset(lambda df: df["le_id"].isin(set(bleeds_y["le_id"])))
full_le_bld =  full_le.subset(lambda df: df["le_id"].isin(set(bleeds_y["le_id"])))
quant_le

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,24419640,24420128,.,+,.,ENSG00000001461.17,NIPAL3,...,last,24419290,24419640,ENST00000003912.7,488.0,internal_exon_extension,ENSG00000001461.17,NIPAL3,1.0,ENSG00000001461.17_1
1,chr1,.,exon,24421813,24422110,.,+,.,ENSG00000001461.17,NIPAL3,...,last,244196402441964024419640,244401712444017124433103,"ENST00000358028.8,ENST00000374399.9,ENST000000...","NULL,NULL,NULL",internal_exon_spliced,ENSG00000001461.17,NIPAL3,2.0,ENSG00000001461.17_2
2,chr1,.,exon,24454053,24454824,.,+,.,ENSG00000001461.17,NIPAL3,...,last,2445350424453504,2445613724456137,"ENST00000003912.7,ENST00000374399.9","NULL,NULL",internal_exon_spliced,ENSG00000001461.17,NIPAL3,3.0,ENSG00000001461.17_3
3,chr1,.,exon,24464120,24466378,.,+,.,ENSG00000001461.17,NIPAL3,...,last,24464025,24464120,ENST00000003912.7,2258.0,internal_exon_extension,ENSG00000001461.17,NIPAL3,4.0,ENSG00000001461.17_4
4,chr1,.,exon,24468985,24472976,.,+,.,ENSG00000001461.17,NIPAL3,...,last,2446412024464120,2446898524468985,"ENST00000003912.7,ENST00000374399.9","NULL,NULL",last_exon_spliced,ENSG00000001461.17,NIPAL3,5.0,ENSG00000001461.17_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129925,chrY,.,exon,21880075,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
129926,chrY,.,exon,21880075,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
129927,chrY,.,exon,21880307,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
129928,chrY,.,exon,21038288,21039044,.,-,.,ENSG00000254488.1,ENSG00000254488,...,last,21039044,21042268,ENST00000527562.1,,last_exon_spliced,ENSG00000254488.1,ENSG00000254488,1.0,ENSG00000254488.1_1


## Step 1 - identify bleedthroughs that have an overlapping/annotated CDS

In some cases, bleedthrough events are defined by the pipeline as novel extensions of annotated last exons (e.g. SIN3B). These events will already have a CDS annotated for the coding region of the last exon, so can use that to define the cryptic peptide

Strategy:
1. Overlap 'quantification' last exons with CDSs, checking that the parent transcripts match
2. Keep these CDSs for these bleedthroughs

In [12]:
# CDS and stop codons are annotated separately in GTFs - still want to report full coding region for bleedthrough including stop codon
# merge CDS & stop codons into single interval by transcript ID
ref_gtf_cds = ref_gtf.subset(lambda df: df.Feature == "CDS")
ref_gtf_sc = ref_gtf.subset(lambda df: df.Feature == "stop_codon")
# ref_gtf_cds_m = pr.concat([ref_gtf_cds, ref_gtf_sc]).merge(strand=True, by="transcript_id")
# ref_gtf_cds_m


In [13]:
bld_cds_anno = quant_le_bld[["ref_gene_name", "le_id", "transcript_id", "transcript_id_ref"]].join(ref_gtf_cds[["gene_name", "gene_id", "transcript_id", "Frame"]], strandedness="same", suffix="_cds")
bld_cds_anno

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name,le_id,transcript_id,transcript_id_ref,Start_cds,End_cds,Strand_cds,gene_name,gene_id,transcript_id_cds,Frame
0,chr7,44579896,44581092,-,TMED4,ENSG00000158604.15_1,ENST00000481238.1,ENST00000457408.7,44581068,44581239,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0
1,chr7,44579903,44581092,-,TMED4,ENSG00000158604.15_1,ENST00000477639.5,ENST00000457408.7,44581068,44581239,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0
2,chr13,50814965,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.ctrl_ctrl_2.18282.3,,50823316,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0
3,chr13,50815080,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.TDP43_ctrl_3.17822.2,,50823316,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0
4,chr18,12672624,12673387,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12673367,12673503,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1
5,chr18,12672624,12673387,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12673367,12673503,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1
6,chr18,12673096,12673387,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673367,12673503,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1
7,chr18,12673096,12673387,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673367,12673503,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1
8,chr19,16854261,16857420,+,SIN3B,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,,16854142,16854262,+,SIN3B,ENSG00000127511.10,ENST00000596802.5,0
9,chr19,32635458,32639355,-,ANKRD27,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,,32639304,32639488,-,ANKRD27,ENSG00000105186.16,ENST00000587352.5,1


In [14]:
# Make the CDS coordinates the main coordinate range
cds_bld_anno = bld_cds_anno.new_position("swap").apply(lambda df: df.rename(columns={"Start_cds": "Start_le", "End_cds": "End_le", "Strand_cds": "Strand_le"}))
# as CDS entries lack the stop codon, extend by 3nt in each case to add in stop codon
cds_bld_anno = cds_bld_anno.extend({"3": 3})
cds_bld_anno

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name,le_id,transcript_id,transcript_id_ref,Start_le,End_le,Strand_le,gene_name,gene_id,transcript_id_cds,Frame
0,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000481238.1,ENST00000457408.7,44579896,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0
1,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000477639.5,ENST00000457408.7,44579903,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0
2,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.ctrl_ctrl_2.18282.3,,50814965,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0
3,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.TDP43_ctrl_3.17822.2,,50815080,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0
4,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1
5,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1
6,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1
7,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1
8,chr19,16854142,16854265,+,SIN3B,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,,16854261,16857420,+,SIN3B,ENSG00000127511.10,ENST00000596802.5,0
9,chr19,32639301,32639488,-,ANKRD27,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,,32635458,32639355,-,ANKRD27,ENSG00000105186.16,ENST00000587352.5,1


In [15]:
# Extract sequence for CDS sequences
seqs_cds_bld_anno = pr.get_sequence(cds_bld_anno, "data/GRCh38.primary_assembly.genome.fa")
cds_bld_anno.dna_seq = seqs_cds_bld_anno

cds_bld_anno

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name,le_id,transcript_id,transcript_id_ref,Start_le,End_le,Strand_le,gene_name,gene_id,transcript_id_cds,Frame,dna_seq
0,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000481238.1,ENST00000457408.7,44579896,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...
1,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000477639.5,ENST00000457408.7,44579903,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...
2,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.ctrl_ctrl_2.18282.3,,50814965,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...
3,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.TDP43_ctrl_3.17822.2,,50815080,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...
4,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...
5,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...
6,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...
7,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...
8,chr19,16854142,16854265,+,SIN3B,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,,16854261,16857420,+,SIN3B,ENSG00000127511.10,ENST00000596802.5,0,GTCCGCCGGGTGCTGAAGAGCCAGGAGGTGTATGAAAACTTCCTCC...
9,chr19,32639301,32639488,-,ANKRD27,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,,32635458,32639355,-,ANKRD27,ENSG00000105186.16,ENST00000587352.5,1,GATGGCAAATTTGAGTTACATCAAAAACTTCAGGTTTAGCAGCTTG...


### Translating CDSs

Minimal information needed is:
- DNA sequence
- Frame - how many bases to remove from start of sequence to get the first base of a codon

To translate all (or all but the first incomplete codon) codons, will need to ensure DNA sequence starts at a complete codon. From there on it should be a simple translation.
- Convert to biopython Seq object (stripping frame positions from start of dna_sequence)
- seq.translate()
- convert back to string

In [16]:
cds_bld_anno = cds_bld_anno.assign("seq_dna_seq",
                    lambda df: df.apply(lambda row: Seq(row["dna_seq"][int(row["Frame"]):]),
                                        axis=1)
                                        )

cds_bld_anno
                                                            

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name,le_id,transcript_id,transcript_id_ref,Start_le,End_le,Strand_le,gene_name,gene_id,transcript_id_cds,Frame,dna_seq,seq_dna_seq
0,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000481238.1,ENST00000457408.7,44579896,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...,"(C, G, G, G, T, G, C, A, T, C, T, C, G, A, C, ..."
1,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000477639.5,ENST00000457408.7,44579903,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...,"(C, G, G, G, T, G, C, A, T, C, T, C, G, A, C, ..."
2,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.ctrl_ctrl_2.18282.3,,50814965,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...,"(G, A, T, A, G, T, G, T, T, G, A, G, T, T, T, ..."
3,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.TDP43_ctrl_3.17822.2,,50815080,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...,"(G, A, T, A, G, T, G, T, T, G, A, G, T, T, T, ..."
4,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ..."
5,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ..."
6,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ..."
7,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ..."
8,chr19,16854142,16854265,+,SIN3B,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,,16854261,16857420,+,SIN3B,ENSG00000127511.10,ENST00000596802.5,0,GTCCGCCGGGTGCTGAAGAGCCAGGAGGTGTATGAAAACTTCCTCC...,"(G, T, C, C, G, C, C, G, G, G, T, G, C, T, G, ..."
9,chr19,32639301,32639488,-,ANKRD27,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,,32635458,32639355,-,ANKRD27,ENSG00000105186.16,ENST00000587352.5,1,GATGGCAAATTTGAGTTACATCAAAAACTTCAGGTTTAGCAGCTTG...,"(A, T, G, G, C, A, A, A, T, T, T, G, A, G, T, ..."


In [17]:
cds_bld_anno.peptide_seq = cds_bld_anno.seq_dna_seq.apply(lambda x: str(x.translate()))
cds_bld_anno

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name,le_id,transcript_id,transcript_id_ref,Start_le,End_le,Strand_le,gene_name,gene_id,transcript_id_cds,Frame,dna_seq,seq_dna_seq,peptide_seq
0,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000481238.1,ENST00000457408.7,44579896,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...,"(C, G, G, G, T, G, C, A, T, C, T, C, G, A, C, ...",RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...
1,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000477639.5,ENST00000457408.7,44579903,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...,"(C, G, G, G, T, G, C, A, T, C, T, C, G, A, C, ...",RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...
2,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.ctrl_ctrl_2.18282.3,,50814965,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...,"(G, A, T, A, G, T, G, T, T, G, A, G, T, T, T, ...",DSVEFRNICSHLALQIEGQQFDRDLNAAHQCLKTIVKKLIQSLANF...
3,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.TDP43_ctrl_3.17822.2,,50815080,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...,"(G, A, T, A, G, T, G, T, T, G, A, G, T, T, T, ...",DSVEFRNICSHLALQIEGQQFDRDLNAAHQCLKTIVKKLIQSLANF...
4,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ...",SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
5,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ...",SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
6,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ...",SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
7,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ...",SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
8,chr19,16854142,16854265,+,SIN3B,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,,16854261,16857420,+,SIN3B,ENSG00000127511.10,ENST00000596802.5,0,GTCCGCCGGGTGCTGAAGAGCCAGGAGGTGTATGAAAACTTCCTCC...,"(G, T, C, C, G, C, C, G, G, G, T, G, C, T, G, ...",VRRVLKSQEVYENFLRCIALFNQELVSGSELLQLVSPFLG*
9,chr19,32639301,32639488,-,ANKRD27,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,,32635458,32639355,-,ANKRD27,ENSG00000105186.16,ENST00000587352.5,1,GATGGCAAATTTGAGTTACATCAAAAACTTCAGGTTTAGCAGCTTG...,"(A, T, G, G, C, A, A, A, T, T, T, G, A, G, T, ...",MANLSYIKNFRFSSLAKDELGYCLTSFEAAIEYIRQGSLSAKPPVR...


In [18]:
for _, df in cds_bld_anno.as_df().iterrows():
    print("\t".join(df[["gene_name", "peptide_seq"]]))

TMED4	RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQDYQRASAYLLVI*
TMED4	RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQDYQRASAYLLVI*
DLEU7	DSVEFRNICSHLALQIEGQQFDRDLNAAHQCLKTIVKKLIQSLANFPSDAHMVACASLRQILQNLPDI*
DLEU7	DSVEFRNICSHLALQIEGQQFDRDLNAAHQCLKTIVKKLIQSLANFPSDAHMVACASLRQILQNLPDI*
CEP76	SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
CEP76	SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
CEP76	SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
CEP76	SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
SIN3B	VRRVLKSQEVYENFLRCIALFNQELVSGSELLQLVSPFLG*
ANKRD27	MANLSYIKNFRFSSLAKDELGYCLTSFEAAIEYIRQGSLSAKPPVRSHPCPGLPLWASWFP*




I manually checked the above and they are correct - now to define full cryptic peptides for those with no annotated CDS overlapping the unique region. To do this:
1. set of le_ids with overlapping/already defined peptides
2. For remaining les, take the full last exon sequences and find overlapping annotated CDSs. Make sure they exactly match at the 5'ends of the exon
    - This gives us the frame/phase for the exon 
3. Extract sequence and translate as before

## Translating last exons without an annotated CDS extending into unique region

In [19]:
# get full coordinates of bleedthough last exons without an annotated peptide 
full_le_bld_nol = full_le_bld.subset(lambda df: ~df["le_id"].isin(set(cds_bld_anno.le_id))).drop_duplicate_positions(strand=True)
full_le_bld_nol

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,21453372,21457150,.,+,.,PAPA.TDP-1.345,,...,,,,,,last_exon_extension,ENSG00000142794.19,NBPF3,3.0,ENSG00000142794.19_3
1,chr1,.,exon,84561927,84563418,.,-,.,PAPA.ctrl_ctrl_1.1091,,...,,,,,,internal_exon_extension,ENSG00000117151.13,CTBS,1.0,ENSG00000117151.13_1
2,chr2,.,exon,225651431,225658565,.,+,.,PAPA.TDP43_ctrl_3.4451,,...,,,,,,internal_exon_extension,ENSG00000144460.13,NYAP2,2.0,ENSG00000144460.13_2
3,chr2,.,exon,197630416,197631220,.,-,.,PAPA.TDP43_ctrl_4.4119,,...,,,,,,internal_exon_extension,ENSG00000162944.11,RFTN2,1.0,ENSG00000162944.11_1
4,chr2,.,exon,197630635,197631220,.,-,.,PAPA.TDP-4.4038,,...,,,,,,internal_exon_extension,ENSG00000162944.11,RFTN2,1.0,ENSG00000162944.11_1
5,chr5,.,exon,148377588,148378845,.,-,.,PAPA.TDP43_19065413_S19.7930,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
6,chr5,.,exon,148377695,148378845,.,-,.,PAPA.Cont-D_S4.6266,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
7,chr5,.,exon,148378366,148378845,.,-,.,PAPA.TDP43-E_S5.6788,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
8,chr6,.,exon,42935573,42936378,.,+,.,PAPA.TDP-4.8982,,...,,,,,,internal_exon_extension,ENSG00000137161.18,CNPY3,1.0,ENSG00000137161.18_1
9,chr7,.,exon,43444214,43449982,.,+,.,PAPA.TDP43-G_S7.7990,,...,,,,,,internal_exon_spliced,ENSG00000002746.15,"HECW1,HECW1",3.0,ENSG00000002746.15_3


In [20]:
# overlap join last exons with CDSs, filtering for exact 5'matches
cds_bld_anno_nol = (full_le_bld_nol[["gene_name", "le_id", "transcript_id", "transcript_id_ref", "ref_gene_name"]].join(ref_gtf_cds[["gene_name", "gene_id", "transcript_id", "Frame"]],
                                                                                                     strandedness="same",
                                                                                                     suffix="_cds")
.subset(lambda df: ((df.Strand == "+") & (df.Start == df.Start_cds)) | ((df.Strand == "-") & (df.End == df.End_cds))
        )
)

cds_bld_anno_nol

Unnamed: 0,Chromosome,Start,End,Strand,gene_name,le_id,transcript_id,transcript_id_ref,ref_gene_name,Start_cds,End_cds,Strand_cds,gene_name_cds,gene_id,transcript_id_cds,Frame
0,chr1,84561927,84563418,-,,ENSG00000117151.13_1,PAPA.ctrl_ctrl_1.1091.9,,CTBS,84563256,84563418,-,CTBS,ENSG00000117151.13,ENST00000370630.6,0
1,chr2,225651431,225658565,+,,ENSG00000144460.13_2,PAPA.TDP43_ctrl_3.4451.2,,NYAP2,225651431,225651535,+,NYAP2,ENSG00000144460.13,ENST00000272907.8,2
2,chr2,197630416,197631220,-,,ENSG00000162944.11_1,PAPA.TDP43_ctrl_4.4119.3,,RFTN2,197631010,197631220,-,RFTN2,ENSG00000162944.11,ENST00000295049.9,2
3,chr2,197630635,197631220,-,,ENSG00000162944.11_1,PAPA.TDP-4.4038.5,,RFTN2,197631010,197631220,-,RFTN2,ENSG00000162944.11,ENST00000295049.9,2
4,chr6,42935573,42936378,+,,ENSG00000137161.18_1,PAPA.TDP-4.8982.1,,CNPY3,42935573,42935670,+,CNPY3,ENSG00000137161.18,ENST00000372836.5,1
5,chr7,43444217,43448981,+,,ENSG00000002746.15_3,PAPA.TDP43_ctrl_2.10456.1,,HECW1,43444217,43445570,+,HECW1,ENSG00000002746.15,ENST00000453890.5,2
6,chr7,43444217,43448981,+,,ENSG00000002746.15_3,PAPA.TDP43_ctrl_2.10456.1,,HECW1,43444217,43445570,+,HECW1,ENSG00000002746.15,ENST00000395891.7,2
7,chr8,37766279,37766741,+,PLPBP,ENSG00000147471.12_1,ENST00000520073.5,ENST00000328195.8,PLPBP,37766279,37766355,+,PLPBP,ENSG00000147471.12,ENST00000328195.8,0
8,chr8,130899857,130904042,-,,ENSG00000155897.10_1,PAPA.TDP43_ctrl_3.12351.12,,ADCY8,130903771,130904042,-,ADCY8,ENSG00000155897.10,ENST00000286355.10,1
9,chr8,130899857,130904042,-,,ENSG00000155897.10_1,PAPA.TDP43_ctrl_3.12351.12,,ADCY8,130903771,130904042,-,ADCY8,ENSG00000155897.10,ENST00000377928.7,1


In [21]:
# overlap join last exons with CDSs, filtering for exact 5'matches
cds_bld_anno_nol = (full_le_bld_nol[["gene_name", "le_id", "transcript_id", "transcript_id_ref", "ref_gene_name"]].join(ref_gtf_cds[["gene_name", "gene_id", "transcript_id", "Frame"]],
                                                                                                     strandedness="same",
                                                                                                     suffix="_cds")
.subset(lambda df: ((df.Strand == "+") & (df.Start == df.Start_cds)) | ((df.Strand == "-") & (df.End == df.End_cds))
        )
)

cds_bld_anno_nol

Unnamed: 0,Chromosome,Start,End,Strand,gene_name,le_id,transcript_id,transcript_id_ref,ref_gene_name,Start_cds,End_cds,Strand_cds,gene_name_cds,gene_id,transcript_id_cds,Frame
0,chr1,84561927,84563418,-,,ENSG00000117151.13_1,PAPA.ctrl_ctrl_1.1091.9,,CTBS,84563256,84563418,-,CTBS,ENSG00000117151.13,ENST00000370630.6,0
1,chr2,225651431,225658565,+,,ENSG00000144460.13_2,PAPA.TDP43_ctrl_3.4451.2,,NYAP2,225651431,225651535,+,NYAP2,ENSG00000144460.13,ENST00000272907.8,2
2,chr2,197630416,197631220,-,,ENSG00000162944.11_1,PAPA.TDP43_ctrl_4.4119.3,,RFTN2,197631010,197631220,-,RFTN2,ENSG00000162944.11,ENST00000295049.9,2
3,chr2,197630635,197631220,-,,ENSG00000162944.11_1,PAPA.TDP-4.4038.5,,RFTN2,197631010,197631220,-,RFTN2,ENSG00000162944.11,ENST00000295049.9,2
4,chr6,42935573,42936378,+,,ENSG00000137161.18_1,PAPA.TDP-4.8982.1,,CNPY3,42935573,42935670,+,CNPY3,ENSG00000137161.18,ENST00000372836.5,1
5,chr7,43444217,43448981,+,,ENSG00000002746.15_3,PAPA.TDP43_ctrl_2.10456.1,,HECW1,43444217,43445570,+,HECW1,ENSG00000002746.15,ENST00000453890.5,2
6,chr7,43444217,43448981,+,,ENSG00000002746.15_3,PAPA.TDP43_ctrl_2.10456.1,,HECW1,43444217,43445570,+,HECW1,ENSG00000002746.15,ENST00000395891.7,2
7,chr8,37766279,37766741,+,PLPBP,ENSG00000147471.12_1,ENST00000520073.5,ENST00000328195.8,PLPBP,37766279,37766355,+,PLPBP,ENSG00000147471.12,ENST00000328195.8,0
8,chr8,130899857,130904042,-,,ENSG00000155897.10_1,PAPA.TDP43_ctrl_3.12351.12,,ADCY8,130903771,130904042,-,ADCY8,ENSG00000155897.10,ENST00000286355.10,1
9,chr8,130899857,130904042,-,,ENSG00000155897.10_1,PAPA.TDP43_ctrl_3.12351.12,,ADCY8,130903771,130904042,-,ADCY8,ENSG00000155897.10,ENST00000377928.7,1


In [22]:
# Now have full last exons with exactly matched annotated CDS (and frame to get complete CDS)
cds_bld_anno_nol = translate_exon(cds_bld_anno_nol)

for _, df in cds_bld_anno_nol.as_df().drop_duplicates(subset=["ref_gene_name", "peptide_seq"]).iterrows():
    print("\t".join(df[["ref_gene_name", "peptide_seq"]]))


CTBS	DHVCTIAKVPFRGAPCSDAAGRQVPYKTIMKQINSSISGNLWDKDQRAPYYNYKVRLFVSYEHLFY
NYAP2	PKVSCKLGRSASTSGVPPPSVTPLRQSSDLQQSQVPSSLANRD
RFTN2	SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLELTTFYYKQGLSLIDSFVFWETSKGKFYVNDILYLRNLNLYQNQ
CNPY3	DLRLIEVTETICKRLLDYSLHKERTGSNRFAKVGFGIVLHPLWGQACMYLSVSAGVSVI
HECW1	DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQLGEGSVPDGPGNQSIELSRPAEEAAVITEAGDQGMVSVGPEGAGELLAQVQKDIQPAPSAEELAEQLDLGEEASALLLEDGEAPASTKEEPLEEEATTQSRAGREEEEKEQEEEGDVSTLEQGEGRLQLRASVKRKSRPCSLPVSELETVIASACGDPETPRTHYIRIHTLLHSMPSAQGGSAAEEEDGAEEESTLKDSSEKDGLSEVDTVAADPSALEEDREEPEGATPGTAHPGHSGGHFPSLANGAAQDGDTHPSTGSESDSSPRQGGDHSCEGCDASCCSPSCYSSSCYSTSCYSSSCYSASCYSPSCYNGNRFASHTRFSSVDSAKISESTVFSSQDDEEEENSAFESVPDSMQSPELDPESTNGAGPWQDELAAPSGHVERSPEGLESPVAGPSNRREG
PLPBP	ILSLCPEIKWHFIGHLQKQNVNKLMGKIKLNMKTKLFCHCIASTTLCGRESNF
ADCY8	RIHISKATLDCLNGDYNVEEGHGKERNEFLRKHNIETYLIKQPEDSLLSLPEDIVKESVSSSDRRNSGATFTEGSWSPELPFDNIVGKQNVSPLFLLLLGHECMLVCISNPDTSEKTDSSLTFSPMNTIIANDQTISLSPQDRINLVCNFRKCQKCQRVPNTLSLSLFFFL
ACER3	SFLPSSLKSNLKLV



Manually checking a few:
- USP31 - There is a TSL5 transcript so can cross-check, is spot on! There is possibly a shortened one without a stop codon :O
- CNPY3 - also correct (refseq has extension partially annotated)
- ACER3 - also correct
- ADCY8 - also correct



In [23]:
# which ones are missing?
missing_bld_ids = set(full_le_bld_nol.subset(lambda df: ~df.le_id.isin(cds_bld_anno_nol.le_id)).ref_gene_name)
full_le_bld_nol.subset(lambda df: ~df.le_id.isin(cds_bld_anno_nol.le_id))

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,21453372,21457150,.,+,.,PAPA.TDP-1.345,,...,,,,,,last_exon_extension,ENSG00000142794.19,NBPF3,3.0,ENSG00000142794.19_3
1,chr5,.,exon,148377588,148378845,.,-,.,PAPA.TDP43_19065413_S19.7930,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
2,chr5,.,exon,148377695,148378845,.,-,.,PAPA.Cont-D_S4.6266,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
3,chr5,.,exon,148378366,148378845,.,-,.,PAPA.TDP43-E_S5.6788,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
4,chr10,.,exon,89330996,89335534,.,-,.,PAPA.ctrl_ctrl_4.14569,,...,,,,,,last_exon_extension,ENSG00000107798.18,LIPA,2.0,ENSG00000107798.18_2
5,chr10,.,exon,89331008,89335534,.,-,.,PAPA.TDP43_19065403_S23.13899,,...,,,,,,last_exon_extension,ENSG00000107798.18,LIPA,2.0,ENSG00000107798.18_2
6,chr10,.,exon,89332056,89335534,.,-,.,PAPA.TDP-6.13417,,...,,,,,,last_exon_extension,ENSG00000107798.18,LIPA,2.0,ENSG00000107798.18_2
7,chr10,.,exon,89334602,89335534,.,-,.,PAPA.TDP43-G_S7.10969,,...,,,,,,last_exon_extension,ENSG00000107798.18,LIPA,2.0,ENSG00000107798.18_2
8,chrX,.,exon,91882906,91891321,.,+,.,PAPA.TDP43_ctrl_4.26532,,...,,,,,,last_exon_extension,ENSG00000102290.23,PCDH11X,3.0,ENSG00000102290.23_3
9,chrX,.,exon,131823775,131825365,.,-,.,PAPA.TDP43-F_S6.20505,,...,,,,,,internal_exon_extension,ENSG00000213468.7,FIRRE,1.0,ENSG00000213468.7_1


In [24]:
# What are their transcript/gene types? If lncRNAs/other ncRNAs, can safely exclude and continue as before
ref_gtf.subset(lambda df: df["gene_name"].isin(missing_bld_ids)).as_df()[["gene_name", "gene_type", "transcript_type"]].drop_duplicates()

Unnamed: 0,gene_name,gene_type,transcript_type
0,NBPF3,protein_coding,protein_coding
68,NBPF3,protein_coding,nonsense_mediated_decay
109,NBPF3,protein_coding,processed_transcript
202,FBXO38-DT,lncRNA,lncRNA
208,LIPA,protein_coding,protein_coding
277,LIPA,protein_coding,processed_transcript
290,PCDH11X,protein_coding,processed_transcript
297,PCDH11X,protein_coding,retained_intron
300,PCDH11X,protein_coding,protein_coding
398,FIRRE,lncRNA,lncRNA


Still need to annotate/predict peptides for NBPF3, LIPA & PCDH11X. Could have been lacking because the exactly matching reference transcripts do not have an annotated CDS for that exon? In which case how can we predict peptides? let's see what they overlap with



In [25]:
# check if the missing IDs have any overlapping reference CDSs
(full_le_bld_nol.subset(lambda df: df.ref_gene_name.isin(missing_bld_ids))
 .join(ref_gtf_cds, strandedness="same", how="left")
 )

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number_b,exon_id,hgnc_id,protein_id,ccdsid
0,chr1,.,exon,21453372,21457150,.,+,.,PAPA.TDP-1.345,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,chr5,.,exon,148377588,148378845,.,-,.,PAPA.TDP43_19065413_S19.7930,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,chr5,.,exon,148377695,148378845,.,-,.,PAPA.Cont-D_S4.6266,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,chr5,.,exon,148378366,148378845,.,-,.,PAPA.TDP43-E_S5.6788,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,chr10,.,exon,89330996,89335534,.,-,.,PAPA.ctrl_ctrl_4.14569,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
5,chr10,.,exon,89331008,89335534,.,-,.,PAPA.TDP43_19065403_S23.13899,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
6,chr10,.,exon,89332056,89335534,.,-,.,PAPA.TDP-6.13417,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
7,chr10,.,exon,89334602,89335534,.,-,.,PAPA.TDP43-G_S7.10969,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
8,chrX,.,exon,91882906,91891321,.,+,.,PAPA.TDP43_ctrl_4.26532,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
9,chrX,.,exon,131823775,131825365,.,-,.,PAPA.TDP43-F_S6.20505,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [26]:
# Now check if any overlapping exons
(full_le_bld_nol.subset(lambda df: df.ref_gene_name.isin(missing_bld_ids - {"FIRRE", "FBXO38-DT"}))
 .join(ref_gtf.subset(lambda df: df.Feature == "exon")[["Feature", "gene_name", "transcript_id", "transcript_type"]],
       strandedness="same",
       suffix="_exon",
       how="left")
       # ensure exact matches at 5'end
 .subset(lambda df: ((df.Strand == "+") & (df.Start == df.Start_exon)) | ((df.Strand == "-") & (df.End == df.End_exon)))
 )

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,ref_gene_name,le_number,le_id,Feature_exon,Start_exon,End_exon,Strand_exon,gene_name_exon,transcript_id_exon,transcript_type
0,chr1,.,exon,21453372,21457150,.,+,.,PAPA.TDP-1.345,,...,NBPF3,3.0,ENSG00000142794.19_3,exon,21453372,21454306,+,NBPF3,ENST00000478653.6,processed_transcript
1,chr10,.,exon,89330996,89335534,.,-,.,PAPA.ctrl_ctrl_4.14569,,...,LIPA,2.0,ENSG00000107798.18_2,exon,89335335,89335534,-,LIPA,ENST00000489359.1,processed_transcript
2,chr10,.,exon,89331008,89335534,.,-,.,PAPA.TDP43_19065403_S23.13899,,...,LIPA,2.0,ENSG00000107798.18_2,exon,89335335,89335534,-,LIPA,ENST00000489359.1,processed_transcript
3,chr10,.,exon,89332056,89335534,.,-,.,PAPA.TDP-6.13417,,...,LIPA,2.0,ENSG00000107798.18_2,exon,89335335,89335534,-,LIPA,ENST00000489359.1,processed_transcript
4,chr10,.,exon,89334602,89335534,.,-,.,PAPA.TDP43-G_S7.10969,,...,LIPA,2.0,ENSG00000107798.18_2,exon,89335335,89335534,-,LIPA,ENST00000489359.1,processed_transcript
5,chrX,.,exon,91882906,91891321,.,+,.,PAPA.TDP43_ctrl_4.26532,,...,PCDH11X,3.0,ENSG00000102290.23_3,exon,91882906,91884007,+,PCDH11X,ENST00000298274.12,processed_transcript


In [27]:
# 1. Caclulate frame for full bleedthroughs (3 - length % 3) - remainder of division by codon, then need to remove x from start of sequence to get a complete codon
# * Do I need a complete transcript?
# 2. Translate as before
full_le_bld_miss = full_le_bld_nol.subset(lambda df: df.ref_gene_name.isin(missing_bld_ids - {"FIRRE", "FBXO38-DT"}))[["Feature", "ref_gene_name", "ref_gene_id", "le_id"]]
full_le_bld_miss = full_le_bld_miss.assign("Frame",
                                           lambda df: 3 - ((df.End - df.Start) % 3)
                                           )


full_le_bld_miss

Unnamed: 0,Chromosome,Feature,Start,End,Strand,ref_gene_name,ref_gene_id,le_id,Frame
0,chr1,exon,21453372,21457150,+,NBPF3,ENSG00000142794.19,ENSG00000142794.19_3,2
1,chr10,exon,89330996,89335534,-,LIPA,ENSG00000107798.18,ENSG00000107798.18_2,1
2,chr10,exon,89331008,89335534,-,LIPA,ENSG00000107798.18,ENSG00000107798.18_2,1
3,chr10,exon,89332056,89335534,-,LIPA,ENSG00000107798.18,ENSG00000107798.18_2,2
4,chr10,exon,89334602,89335534,-,LIPA,ENSG00000107798.18,ENSG00000107798.18_2,1
5,chrX,exon,91882906,91891321,+,PCDH11X,ENSG00000102290.23,ENSG00000102290.23_3,3


In [28]:
translate_exon(full_le_bld_miss)



Unnamed: 0,Chromosome,Feature,Start,End,Strand,ref_gene_name,ref_gene_id,le_id,Frame,peptide_seq
0,chr1,exon,21453372,21457150,+,NBPF3,ENSG00000142794.19,ENSG00000142794.19_3,2,PIKEDPETFSLPRR
1,chr10,exon,89330996,89335534,-,LIPA,ENSG00000107798.18,ENSG00000107798.18_2,1,RL
2,chr10,exon,89331008,89335534,-,LIPA,ENSG00000107798.18,ENSG00000107798.18_2,1,RL
3,chr10,exon,89332056,89335534,-,LIPA,ENSG00000107798.18,ENSG00000107798.18_2,2,GFDISMHQGSYQTKF
4,chr10,exon,89334602,89335534,-,LIPA,ENSG00000107798.18,ENSG00000107798.18_2,1,RL
5,chrX,exon,91882906,91891321,+,PCDH11X,ENSG00000102290.23,ENSG00000102290.23_3,3,DSRTSTIEICSEI


I think the above is not going to work... From manual inspection whilst it translates the exon correctly, the problem is you don't know the frame based on the transcript as a whole. In the suggested peptide seq, for e.g. LIPA, I can't even find a start codon in the corresponding frame... For now just assume that these cannot be translated given the known transcript structure?


### Extracting unique regions of peptides - translating shared internal plus downstream exon in CDS

1. Based on overlap/matching 5'ends of **internal exons**, assign cryptic to its parent transcript/CDS. Create a cryptic_tx_id - ref_tx;le_id;start;end for le_id, ref_tx_id mapping
    - Different les have different coordinates - want to ensure cryptic bleedthrough is inserted once per matching reference transcript.
    - need to duplicate different transcripts?
    left join CDS & cryptic last exons (checking matches for 5'end), if match assign a cryptic_overlap column value of 1
2. Assign cryptic_tx_id
3. Assign exon/region numbers for matching CDSs, sort by tx & region numbers (strand aware, 5'-3')
4. subset for cryptic overlap or cryptic overlap + 1 (in terms of index)
    - two steps - mask for integer locs of indexes of cryptic overlap, then create mask with ilocs + 1
5. spliced_subsequence by cryptic_tx_id, then translate sequence (calculating frame Or just subtract frame from the first exon/CDS?)

In [29]:
# bleedthroughs with prredicted peptide
full_le_bld_pep = full_le_bld.subset(lambda df: ~df.ref_gene_name.isin(missing_bld_ids))

# get all non-last exon CDS entries
ref_gtf_cds_f = get_terminal_regions(ref_gtf_cds, feature_key="CDS", which_region="first")
ref_gtf_cds_int = get_internal_regions(ref_gtf_cds, feature_key="CDS")
ref_gtf_cds_nl = pr.concat([ref_gtf_cds_f, ref_gtf_cds_int])

print(ref_gtf_cds_nl.dtypes)
ref_gtf_cds_nl




Chromosome                  category
Source                        object
Feature                       object
Start                          int64
End                            int64
Score                         object
Strand                      category
Frame                         object
gene_id                       object
gene_type                     object
gene_name                     object
level                         object
havana_gene                   object
transcript_id                 object
transcript_type               object
transcript_name               object
transcript_support_level      object
tag                           object
havana_transcript             object
exon_number                    Int64
exon_id                       object
hgnc_id                       object
protein_id                    object
ccdsid                        object
dtype: object


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,hgnc_id,protein_id,ccdsid
0,chr1,HAVANA,CDS,21445086,21445219,.,+,0,ENSG00000142794.19,protein_coding,...,nonsense_mediated_decay,NBPF3-201,2.0,,OTTHUMT00000008190.3,2,ENSE00003642335.1,HGNC:25076,ENSP00000316739.7,
1,chr1,HAVANA,CDS,21445086,21445219,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-202,1.0,"basic,Ensembl_canonical,MANE_Select,appris_pri...",OTTHUMT00000476523.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000316782.5,CCDS216.1
2,chr1,HAVANA,CDS,21445086,21445219,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-203,2.0,"basic,CCDS",OTTHUMT00000008193.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000340336.5,CCDS57976.1
3,chr1,HAVANA,CDS,21445086,21445219,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
4,chr1,HAVANA,CDS,21445086,21445219,.,+,0,ENSG00000142794.19,protein_coding,...,nonsense_mediated_decay,NBPF3-212,3.0,,OTTHUMT00000021355.2,2,ENSE00003642335.1,HGNC:25076,ENSP00000478530.1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,chrX,HAVANA,CDS,92387734,92387933,.,+,0,ENSG00000102290.23,protein_coding,...,protein_coding,PCDH11X-207,1.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000359297.1,5,ENSE00001718872.1,HGNC:8656,ENSP00000384758.1,CCDS55459.1
419,chrX,HAVANA,CDS,91876780,91879273,.,+,0,ENSG00000102290.23,protein_coding,...,protein_coding,PCDH11X-208,1.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000359295.2,2,ENSE00003757792.1,HGNC:8656,ENSP00000423762.1,CCDS55458.1
420,chrX,HAVANA,CDS,92201374,92201455,.,+,0,ENSG00000102290.23,protein_coding,...,protein_coding,PCDH11X-208,1.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000359295.2,3,ENSE00001300322.1,HGNC:8656,ENSP00000423762.1,CCDS55458.1
421,chrX,HAVANA,CDS,92263113,92263143,.,+,0,ENSG00000102290.23,protein_coding,...,protein_coding,PCDH11X-208,1.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000359295.2,4,ENSE00001327755.1,HGNC:8656,ENSP00000423762.1,CCDS55458.1


In [45]:
# Find CDSs of internal exons that overlap with bleedthrough last exon and exactly match at 5' end
# Generate a dict of {tx_cryp_id: (ref_tx_id, olap_exon_number)}
# tx_cryp_id : ref_tx;le_id;start;end (unique for each interval for a last exon (some have different 3'ends))
# will need to add downstream exon for every matched tx_cryp_id (using exon number to extract next downstream)




# overlap CDSs with bleedthroughs, finding exact 5' end matches
# construct a master df of cryptic IDs plus associted transcript IDs, le_ids
cryp_id_df = (ref_gtf_cds_nl.join(full_le_bld_pep[["le_id", "ref_gene_name"]],
                  strandedness="same",
                  suffix="_le")
                  # filter for exact 5'end matches
.subset(lambda df: ((df.Strand == "+") & (df.Start == df.Start_le)) | ((df.Strand == "-") & (df.End == df.End_le)))
        # construct tx_cryp_id
        .assign("tx_cryp_id",
                lambda df: df["transcript_id"].str.cat(df[["le_id", "gene_name", "Start_le", "End_le"]].astype(str), sep=";"))
                                                
.as_df()[["transcript_id", "exon_number", "Frame", "gene_name", "le_id", "tx_cryp_id"]]
)

cryp_id_dict = cryp_id_df.drop_duplicates().set_index("tx_cryp_id").to_dict(orient="index")
cryp_id_dict

#

{'ENST00000370630.6;ENSG00000117151.13_1;CTBS;84561927;84563418': {'transcript_id': 'ENST00000370630.6',
  'exon_number': 6,
  'Frame': '0',
  'gene_name': 'CTBS',
  'le_id': 'ENSG00000117151.13_1'},
 'ENST00000272907.8;ENSG00000144460.13_2;NYAP2;225651431;225658565': {'transcript_id': 'ENST00000272907.8',
  'exon_number': 7,
  'Frame': '2',
  'gene_name': 'NYAP2',
  'le_id': 'ENSG00000144460.13_2'},
 'ENST00000295049.9;ENSG00000162944.11_1;RFTN2;197630416;197631220': {'transcript_id': 'ENST00000295049.9',
  'exon_number': 5,
  'Frame': '2',
  'gene_name': 'RFTN2',
  'le_id': 'ENSG00000162944.11_1'},
 'ENST00000295049.9;ENSG00000162944.11_1;RFTN2;197630635;197631220': {'transcript_id': 'ENST00000295049.9',
  'exon_number': 5,
  'Frame': '2',
  'gene_name': 'RFTN2',
  'le_id': 'ENSG00000162944.11_1'},
 'ENST00000372836.5;ENSG00000137161.18_1;CNPY3;42935573;42936378': {'transcript_id': 'ENST00000372836.5',
  'exon_number': 3,
  'Frame': '1',
  'gene_name': 'CNPY3',
  'le_id': 'ENSG000001

In [46]:
# Now extract next downstream exon for each bleedthrough matched first/internal CDS (1 copy for each cryp_tx_id)
# first, make sure exon number column is int dtype. Also return to using all CDSs, as downstream annotated exon could be a last exon/CDS 
ref_gtf_cds = ref_gtf_cds.assign("exon_number", lambda df: df.exon_number.astype(int))

# get downstream CDS for every bleedthrough entry
bld_cds_ds = pr.concat([ref_gtf_cds.subset(lambda df: (df.transcript_id == col_dict["transcript_id"]) & (df.exon_number == (col_dict["exon_number"] + 1))).assign("tx_cryp_id", lambda df: pd.Series([cryp_tx_id]*len(df), index=df.index)) 
 for cryp_tx_id, col_dict in cryp_id_dict.items()])
# get actual overlapping CDS for every bleedthrough entry
bld_cds_olap = pr.concat([ref_gtf_cds.subset(lambda df: (df.transcript_id == col_dict["transcript_id"]) & (df.exon_number == col_dict["exon_number"])).assign("tx_cryp_id", lambda df: pd.Series([cryp_tx_id]*len(df), index=df.index)) 
 for cryp_tx_id, col_dict in cryp_id_dict.items()])
# combine
bld_cds_olap_ds = pr.concat([bld_cds_olap, bld_cds_ds]).sort()

bld_cds_olap_ds

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,hgnc_id,protein_id,ccdsid,tx_cryp_id
0,chr1,HAVANA,CDS,84555001,84555199,.,-,0,ENSG00000117151.13,protein_coding,...,CTBS-202,1.0,"basic,Ensembl_canonical,MANE_Select,appris_pri...",OTTHUMT00000027457.3,7,ENSE00001823837.3,HGNC:2496,ENSP00000359664.4,CCDS698.1,ENST00000370630.6;ENSG00000117151.13_1;CTBS;84...
1,chr1,HAVANA,CDS,84563256,84563418,.,-,0,ENSG00000117151.13,protein_coding,...,CTBS-202,1.0,"basic,Ensembl_canonical,MANE_Select,appris_pri...",OTTHUMT00000027457.3,6,ENSE00003657005.1,HGNC:2496,ENSP00000359664.4,CCDS698.1,ENST00000370630.6;ENSG00000117151.13_1;CTBS;84...
2,chr2,HAVANA,CDS,225651431,225651535,.,+,2,ENSG00000144460.13,protein_coding,...,NYAP2-201,1.0,"CAGE_supported_TSS,inferred_transcript_model,R...",,7,ENSE00003965620.1,HGNC:29291,ENSP00000272907.7,,ENST00000272907.8;ENSG00000144460.13_2;NYAP2;2...
3,chr2,HAVANA,CDS,225698316,225698541,.,+,0,ENSG00000144460.13,protein_coding,...,NYAP2-201,1.0,"CAGE_supported_TSS,inferred_transcript_model,R...",,8,ENSE00003965621.1,HGNC:29291,ENSP00000272907.7,,ENST00000272907.8;ENSG00000144460.13_2;NYAP2;2...
4,chr2,HAVANA,CDS,197617799,197617921,.,-,2,ENSG00000162944.11,protein_coding,...,RFTN2-201,1.0,"basic,Ensembl_canonical,MANE_Select,appris_pri...",OTTHUMT00000256106.3,6,ENSE00001070067.1,HGNC:26402,ENSP00000295049.3,CCDS2323.1,ENST00000295049.9;ENSG00000162944.11_1;RFTN2;1...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,chr19,HAVANA,CDS,16854142,16854261,.,+,0,ENSG00000127511.10,protein_coding,...,SIN3B-202,1.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000462848.1,8,ENSE00000689616.1,HGNC:19354,ENSP00000369131.1,CCDS32946.1,ENST00000379803.5;ENSG00000127511.10_2;SIN3B;1...
68,chr19,HAVANA,CDS,16862351,16862559,.,+,1,ENSG00000127511.10,protein_coding,...,SIN3B-201,1.0,"basic,Ensembl_canonical,MANE_Select,appris_pri...",OTTHUMT00000462846.2,9,ENSE00000689615.1,HGNC:19354,ENSP00000248054.4,CCDS74308.1,ENST00000248054.10;ENSG00000127511.10_2;SIN3B;...
69,chr19,HAVANA,CDS,16862351,16862559,.,+,1,ENSG00000127511.10,protein_coding,...,SIN3B-202,1.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000462848.1,9,ENSE00000689615.1,HGNC:19354,ENSP00000369131.1,CCDS32946.1,ENST00000379803.5;ENSG00000127511.10_2;SIN3B;1...
70,chr19,HAVANA,CDS,32631401,32631494,.,-,0,ENSG00000105186.16,protein_coding,...,ANKRD27-201,1.0,"basic,Ensembl_canonical,MANE_Select,appris_pri...",OTTHUMT00000450329.2,13,ENSE00000821623.1,HGNC:25310,ENSP00000304292.3,CCDS32986.1,ENST00000306065.9;ENSG00000105186.16_3;ANKRD27...


In [50]:





# peptide sequences for matched internal exon CDS + downstream exon CDS
cds_bld_anno_skip = translate_tx(bld_cds_olap_ds, "tx_cryp_id")
cds_bld_anno_skip

cds_bld_anno_skip = cryp_id_df.drop_duplicates().merge(cds_bld_anno_skip[["tx_cryp_id", "peptide_seq"]], on="tx_cryp_id", how="left")
cds_bld_anno_skip



Unnamed: 0,transcript_id,exon_number,Frame,gene_name,le_id,tx_cryp_id,peptide_seq
0,ENST00000370630.6,6,0,CTBS,ENSG00000117151.13_1,ENST00000370630.6;ENSG00000117151.13_1;CTBS;84...,DHVCTIAKVPFRGAPCSDAAGRQVPYKTIMKQINSSISGNLWDKDQ...
1,ENST00000272907.8,7,2,NYAP2,ENSG00000144460.13_2,ENST00000272907.8;ENSG00000144460.13_2;NYAP2;2...,PKVSCKLGRSASTSGVPPPSVTPLRQSSDLQQSQVACMQWFHGDHT...
2,ENST00000295049.9,5,2,RFTN2,ENSG00000162944.11_1,ENST00000295049.9;ENSG00000162944.11_1;RFTN2;1...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...
3,ENST00000295049.9,5,2,RFTN2,ENSG00000162944.11_1,ENST00000295049.9;ENSG00000162944.11_1;RFTN2;1...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...
4,ENST00000372836.5,3,1,CNPY3,ENSG00000137161.18_1,ENST00000372836.5;ENSG00000137161.18_1;CNPY3;4...,DLRLIEVTETICKRLLDYSLHKERTGSNRFAKGMSETFETLHNLVH...
5,ENST00000395891.7,11,2,HECW1,ENSG00000002746.15_3,ENST00000395891.7;ENSG00000002746.15_3;HECW1;4...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...
6,ENST00000453890.5,10,2,HECW1,ENSG00000002746.15_3,ENST00000453890.5;ENSG00000002746.15_3;HECW1;4...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...
7,ENST00000457408.7,4,0,TMED4,ENSG00000158604.15_1,ENST00000457408.7;ENSG00000158604.15_1;TMED4;4...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...
8,ENST00000457408.7,4,0,TMED4,ENSG00000158604.15_1,ENST00000457408.7;ENSG00000158604.15_1;TMED4;4...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...
9,ENST00000328195.8,4,0,PLPBP,ENSG00000147471.12_1,ENST00000328195.8;ENSG00000147471.12_1;PLPBP;3...,ILSLCPEIKWHFIGHLQKQNVNKLMAVPNLFMLETVDSVKLADKVN...


In [65]:
# now want to create a final df combining full bleedthrough peptide sequence, the 'skip' peptide sequence, then the 'cryptic unique' peptide
cds_bld_anno_comb = pd.concat([cds_bld_anno_nol.as_df()[["le_id", "peptide_seq"]].drop_duplicates(),
                               cds_bld_anno.as_df()[["le_id", "peptide_seq"]].drop_duplicates()
                              ],
                              ignore_index=True)

peps_comb = cds_bld_anno_skip.merge(cds_bld_anno_comb, on="le_id", how="left", suffixes=[None, "_cryptic"])

peps_comb

Unnamed: 0,transcript_id,exon_number,Frame,gene_name,le_id,tx_cryp_id,peptide_seq,peptide_seq_cryptic
0,ENST00000370630.6,6,0,CTBS,ENSG00000117151.13_1,ENST00000370630.6;ENSG00000117151.13_1;CTBS;84...,DHVCTIAKVPFRGAPCSDAAGRQVPYKTIMKQINSSISGNLWDKDQ...,DHVCTIAKVPFRGAPCSDAAGRQVPYKTIMKQINSSISGNLWDKDQ...
1,ENST00000272907.8,7,2,NYAP2,ENSG00000144460.13_2,ENST00000272907.8;ENSG00000144460.13_2;NYAP2;2...,PKVSCKLGRSASTSGVPPPSVTPLRQSSDLQQSQVACMQWFHGDHT...,PKVSCKLGRSASTSGVPPPSVTPLRQSSDLQQSQVPSSLANRD
2,ENST00000295049.9,5,2,RFTN2,ENSG00000162944.11_1,ENST00000295049.9;ENSG00000162944.11_1;RFTN2;1...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...
3,ENST00000295049.9,5,2,RFTN2,ENSG00000162944.11_1,ENST00000295049.9;ENSG00000162944.11_1;RFTN2;1...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...
4,ENST00000372836.5,3,1,CNPY3,ENSG00000137161.18_1,ENST00000372836.5;ENSG00000137161.18_1;CNPY3;4...,DLRLIEVTETICKRLLDYSLHKERTGSNRFAKGMSETFETLHNLVH...,DLRLIEVTETICKRLLDYSLHKERTGSNRFAKVGFGIVLHPLWGQA...
5,ENST00000395891.7,11,2,HECW1,ENSG00000002746.15_3,ENST00000395891.7;ENSG00000002746.15_3;HECW1;4...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...
6,ENST00000453890.5,10,2,HECW1,ENSG00000002746.15_3,ENST00000453890.5;ENSG00000002746.15_3;HECW1;4...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...
7,ENST00000457408.7,4,0,TMED4,ENSG00000158604.15_1,ENST00000457408.7;ENSG00000158604.15_1;TMED4;4...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...
8,ENST00000457408.7,4,0,TMED4,ENSG00000158604.15_1,ENST00000457408.7;ENSG00000158604.15_1;TMED4;4...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...
9,ENST00000328195.8,4,0,PLPBP,ENSG00000147471.12_1,ENST00000328195.8;ENSG00000147471.12_1;PLPBP;3...,ILSLCPEIKWHFIGHLQKQNVNKLMAVPNLFMLETVDSVKLADKVN...,ILSLCPEIKWHFIGHLQKQNVNKLMGKIKLNMKTKLFCHCIASTTL...


In [70]:
# function to match two peptide sequences & find position where they stop matching (assuming they start at the same position)
def longest_matching_substring(str1: str, str2: str) -> int:
    '''Return the index of the final position in longest exactly matched substring from the beginning of two strings

    this is intended for strings that begin with identical values. it only checks the longest matching substring from the first position in each string. if there is one mismatch, the function terminates
    

    Parameters
    ----------
    str1 : str
        first string of pair wish to find the longest matching substring from the beginning of the string
    str2 : str
        second string of pair wish to find the longest matching substring from the beginning of the string

    Returns
    -------
    int
        index of the final position (in either string) of the longest matched substring between the two. If you wish to slice a string to retain/exclude the longest match, remember to add one to returned value
    '''
    longest_substring = 0 
    for i, (char1, char2) in enumerate(zip(str1, str2)):
        if char1 == char2:
            longest_substring = i
        else:
            break

    return longest_substring 

string1 = "abcdefg"
string2 = "abcxyz"

print("Testing with strings:", string1, string2)
print("Original function result:", longest_matching_substring(string1, string2))
print("Original function result extracting matched string1:", string1[:longest_matching_substring(string1, string2) + 1])
print("Original function result extracting unique part of string1:", string1[longest_matching_substring(string1, string2) + 1:])


Testing with strings: abcdefg abcxyz
Original function result: 2
Original function result extracting matched string1: abc
Original function result extracting unique part of string1: defg


# Final step - identifying unique-to-cryptic peptides



In [83]:
# Get a slice of the point where cryptic peptide and overlapping normal peptide match/align
peps_comb["longest_match_slice"] = peps_comb.apply(lambda row: longest_matching_substring(row["peptide_seq_cryptic"], row["peptide_seq"]) + 1,
                                                                   axis=1)

# slice the cruptic peptide to get unique region
peps_comb["peptide_seq_cryptic_uniq"] = peps_comb.apply(lambda row: row["peptide_seq_cryptic"][row["longest_match_slice"]:], axis=1)

peps_comb


Unnamed: 0,transcript_id,exon_number,Frame,gene_name,le_id,tx_cryp_id,peptide_seq,peptide_seq_cryptic,longest_match_slice,peptide_seq_cryptic_uniq
0,ENST00000370630.6,6,0,CTBS,ENSG00000117151.13_1,ENST00000370630.6;ENSG00000117151.13_1;CTBS;84...,DHVCTIAKVPFRGAPCSDAAGRQVPYKTIMKQINSSISGNLWDKDQ...,DHVCTIAKVPFRGAPCSDAAGRQVPYKTIMKQINSSISGNLWDKDQ...,54,VRLFVSYEHLFY
1,ENST00000272907.8,7,2,NYAP2,ENSG00000144460.13_2,ENST00000272907.8;ENSG00000144460.13_2;NYAP2;2...,PKVSCKLGRSASTSGVPPPSVTPLRQSSDLQQSQVACMQWFHGDHT...,PKVSCKLGRSASTSGVPPPSVTPLRQSSDLQQSQVPSSLANRD,35,PSSLANRD
2,ENST00000295049.9,5,2,RFTN2,ENSG00000162944.11_1,ENST00000295049.9;ENSG00000162944.11_1;RFTN2;1...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...,70,KFYVNDILYLRNLNLYQNQ
3,ENST00000295049.9,5,2,RFTN2,ENSG00000162944.11_1,ENST00000295049.9;ENSG00000162944.11_1;RFTN2;1...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...,70,KFYVNDILYLRNLNLYQNQ
4,ENST00000372836.5,3,1,CNPY3,ENSG00000137161.18_1,ENST00000372836.5;ENSG00000137161.18_1;CNPY3;4...,DLRLIEVTETICKRLLDYSLHKERTGSNRFAKGMSETFETLHNLVH...,DLRLIEVTETICKRLLDYSLHKERTGSNRFAKVGFGIVLHPLWGQA...,32,VGFGIVLHPLWGQACMYLSVSAGVSVI
5,ENST00000395891.7,11,2,HECW1,ENSG00000002746.15_3,ENST00000395891.7;ENSG00000002746.15_3;HECW1;4...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...,451,
6,ENST00000453890.5,10,2,HECW1,ENSG00000002746.15_3,ENST00000453890.5;ENSG00000002746.15_3;HECW1;4...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...,450,G
7,ENST00000457408.7,4,0,TMED4,ENSG00000158604.15_1,ENST00000457408.7;ENSG00000158604.15_1;TMED4;4...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...,49,ASAYLLVI*
8,ENST00000457408.7,4,0,TMED4,ENSG00000158604.15_1,ENST00000457408.7;ENSG00000158604.15_1;TMED4;4...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...,49,ASAYLLVI*
9,ENST00000328195.8,4,0,PLPBP,ENSG00000147471.12_1,ENST00000328195.8;ENSG00000147471.12_1;PLPBP;3...,ILSLCPEIKWHFIGHLQKQNVNKLMAVPNLFMLETVDSVKLADKVN...,ILSLCPEIKWHFIGHLQKQNVNKLMGKIKLNMKTKLFCHCIASTTL...,25,GKIKLNMKTKLFCHCIASTTLCGRESNF


In [87]:
peps_comb.drop(columns=["exon_number", "Frame", "longest_match_slice"]).rename(columns={"transcript_id": "ref_transcript_id",
                                                                                        "peptide_seq": "peptide_seq_skip"})

outcol_order = ["tx_cryp_id", "le_id", "gene_name", "ref_transcript_id", "peptide_seq_cryptic", "peptide_seq_cryptic_uniq", "peptide_seq_skip"]

Unnamed: 0,ref_transcript_id,gene_name,le_id,tx_cryp_id,peptide_seq_skip,peptide_seq_cryptic,peptide_seq_cryptic_uniq
0,ENST00000370630.6,CTBS,ENSG00000117151.13_1,ENST00000370630.6;ENSG00000117151.13_1;CTBS;84...,DHVCTIAKVPFRGAPCSDAAGRQVPYKTIMKQINSSISGNLWDKDQ...,DHVCTIAKVPFRGAPCSDAAGRQVPYKTIMKQINSSISGNLWDKDQ...,VRLFVSYEHLFY
1,ENST00000272907.8,NYAP2,ENSG00000144460.13_2,ENST00000272907.8;ENSG00000144460.13_2;NYAP2;2...,PKVSCKLGRSASTSGVPPPSVTPLRQSSDLQQSQVACMQWFHGDHT...,PKVSCKLGRSASTSGVPPPSVTPLRQSSDLQQSQVPSSLANRD,PSSLANRD
2,ENST00000295049.9,RFTN2,ENSG00000162944.11_1,ENST00000295049.9;ENSG00000162944.11_1;RFTN2;1...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...,KFYVNDILYLRNLNLYQNQ
3,ENST00000295049.9,RFTN2,ENSG00000162944.11_1,ENST00000295049.9;ENSG00000162944.11_1;RFTN2;1...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...,SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLE...,KFYVNDILYLRNLNLYQNQ
4,ENST00000372836.5,CNPY3,ENSG00000137161.18_1,ENST00000372836.5;ENSG00000137161.18_1;CNPY3;4...,DLRLIEVTETICKRLLDYSLHKERTGSNRFAKGMSETFETLHNLVH...,DLRLIEVTETICKRLLDYSLHKERTGSNRFAKVGFGIVLHPLWGQA...,VGFGIVLHPLWGQACMYLSVSAGVSVI
5,ENST00000395891.7,HECW1,ENSG00000002746.15_3,ENST00000395891.7;ENSG00000002746.15_3;HECW1;4...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...,
6,ENST00000453890.5,HECW1,ENSG00000002746.15_3,ENST00000453890.5;ENSG00000002746.15_3;HECW1;4...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...,DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQ...,G
7,ENST00000457408.7,TMED4,ENSG00000158604.15_1,ENST00000457408.7;ENSG00000158604.15_1;TMED4;4...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...,ASAYLLVI*
8,ENST00000457408.7,TMED4,ENSG00000158604.15_1,ENST00000457408.7;ENSG00000158604.15_1;TMED4;4...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...,RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...,ASAYLLVI*
9,ENST00000328195.8,PLPBP,ENSG00000147471.12_1,ENST00000328195.8;ENSG00000147471.12_1;PLPBP;3...,ILSLCPEIKWHFIGHLQKQNVNKLMAVPNLFMLETVDSVKLADKVN...,ILSLCPEIKWHFIGHLQKQNVNKLMGKIKLNMKTKLFCHCIASTTL...,GKIKLNMKTKLFCHCIASTTLCGRESNF
