In [57]:
import pyranges as pr
import pandas as pd
from Bio.Seq import Seq



In [88]:
def translate_exon(gr: pr.PyRanges, fasta: str = "data/GRCh38.primary_assembly.genome.fa", drop_seqs: bool = True) -> pr.PyRanges:
    '''Generate a peptide sequence for each continuous genomic interval (e.g. exon)

    For every interval the genomic sequence is extracted from `fasta`, the first
    'Frame' nucleotides are removed so that the sequence begins on a complete codon,
    and the remainder is translated with BioPython using `to_stop=True` (translation
    ends at the first in-frame stop codon, which is not included in the output).

    Parameters
    ----------
    gr : pr.PyRanges
        intervals to translate. Must contain a 'Frame' column whose values can be converted with int()
        (number of nucleotides to skip at the start of the extracted sequence) and must not already
        contain a 'dna_seq' column. The object passed in is copied and left unmodified.
    fasta : str, optional
        path to FASTA file of genome sequence, by default "data/GRCh38.primary_assembly.genome.fa"
    drop_seqs : bool, optional
        whether to drop columns containing extracted genomic sequence ('dna_seq', 'seq_dna_seq'), by default True

    Returns
    -------
    pr.PyRanges
        copy of input PyRanges object with added 'peptide_seq' column containing str of peptide sequences

    Raises
    ------
    AssertionError
        if 'Frame' is missing from, or 'dna_seq' is already present in, the columns of `gr`
    '''

    assert "Frame" in gr.columns, "gr must contain a 'Frame' column"
    assert "dna_seq" not in gr.columns, "gr already contains a 'dna_seq' column"

    # work on a copy: attribute assignment below adds a column to the object it is called on,
    # which would otherwise leak 'dna_seq' into the caller's PyRanges and make a second call
    # on the same object fail the assertion above
    gr = gr.copy()

    # extract genomic sequences for intervals, add as column
    gr.dna_seq = pr.get_sequence(gr, fasta)

    # convert dna_seq to a Seq object to translate (BioPython)
    # remove nucleotides from start if necessary to get complete codon
    gr = gr.assign("seq_dna_seq",
                    lambda df: df.apply(lambda row: Seq(row["dna_seq"][int(row["Frame"]):]),
                                        axis=1)
                                        )

    # generate peptide sequence (up to, but excluding, the first stop codon)
    gr = gr.assign("peptide_seq",
                   lambda df: df["seq_dna_seq"].apply(lambda x: str(x.translate(to_stop=True)))
                   )

    if drop_seqs:
        return gr.drop(["dna_seq", "seq_dna_seq"])

    return gr
    
    



In [59]:
# Manually validated bleedthrough events - these define the gene / last exon IDs
# for which peptide sequences are extracted
bleeds_cols = ["gene_name", "le_id", "exper_cryp", "event_manual_validation", "notes"]
bleeds = pd.read_csv(
    "data/riboseq_manual_verification_of_i3_cortical_cryptic_bleedthroughs.tsv",
    sep="\t",
    usecols=bleeds_cols,
)

# keep only the events whose manual validation is recorded as "yes"
is_validated = bleeds["event_manual_validation"] == "yes"
bleeds_y = bleeds.loc[is_validated]
bleeds_y


Unnamed: 0,gene_name,le_id,exper_cryp,event_manual_validation,notes
0,ACER3,ENSG00000078124.13_1,"brown_i3_cortical,seddighi_i3_cortical",yes,"coverage quite low, but reads consistently up ..."
1,ADCY8,ENSG00000155897.10_1,humphrey_i3_cortical,yes,seddighi - event looks real but v lowly expres...
2,ANKRD27,ENSG00000105186.16_3,"brown_i3_cortical,humphrey_i3_cortical,seddigh...",yes,
9,CEP76,ENSG00000101624.11_3,brown_i3_cortical,yes,"looks like real 3'UTR, but unclear whether cry..."
10,CNPY3,ENSG00000137161.18_1,"humphrey_i3_cortical,seddighi_i3_cortical",yes,reads in cryptic also downstream of stops in a...
11,CTBS,ENSG00000117151.13_1,humphrey_i3_cortical,yes,"genuine bleedthrough, not particularly cryptic..."
12,DLEU7,ENSG00000186047.11_2,humphrey_i3_cortical,yes,
14,FBXO38-DT,ENSG00000247199.6_1,humphrey_i3_cortical,yes,very lowly expressed
15,FIRRE,ENSG00000213468.7_1,brown_i3_cortical,yes,lncRNA. Looks real but lowly expressed
17,HECW1,ENSG00000002746.15_3,brown_i3_cortical,yes,definitely upregulated but also basal IR. Even...


In [60]:
%%time
# read in reference GTF (used to define last exons), subsetting only to cryptic-containing genes to save memory
ref_gtf = pr.read_gtf("data/reference_filtered.gtf").subset(lambda df: df.gene_name.isin(set(bleeds_y.gene_name)))
ref_gtf

CPU times: user 22 s, sys: 5.14 s, total: 27.1 s
Wall time: 1min 5s


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,hgnc_id,protein_id,ccdsid
0,chr1,HAVANA,transcript,21440127,21483467,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,,,HGNC:25076,ENSP00000415711.2,CCDS57977.1
1,chr1,HAVANA,exon,21440127,21440348,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,1,ENSE00001546346.2,HGNC:25076,ENSP00000415711.2,CCDS57977.1
2,chr1,HAVANA,exon,21444947,21445219,.,+,.,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
3,chr1,HAVANA,CDS,21445086,21445219,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
4,chr1,HAVANA,start_codon,21445086,21445089,.,+,0,ENSG00000142794.19,protein_coding,...,protein_coding,NBPF3-205,2.0,"basic,appris_alternative_2,CCDS",OTTHUMT00000476522.1,2,ENSE00003642335.1,HGNC:25076,ENSP00000415711.2,CCDS57977.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,chrX,HAVANA,exon,131755597,131755746,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,9,ENSE00001752569.1,HGNC:49627,,
1537,chrX,HAVANA,exon,131749305,131749458,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,10,ENSE00001674554.1,HGNC:49627,,
1538,chrX,HAVANA,exon,131711650,131711720,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,11,ENSE00001594498.1,HGNC:49627,,
1539,chrX,HAVANA,exon,131709497,131709556,.,-,.,ENSG00000213468.7,lncRNA,...,lncRNA,FIRRE-201,2.0,basic,OTTHUMT00000493690.2,12,ENSE00001734033.1,HGNC:49627,,


In [61]:
# Load the last exon references
# 'quant last exons' (unique regions of last exons only) are used first to define 'annotated bleedthroughs'
quant_le = pr.read_gtf("data/novel_ref_combined.quant.last_exons.gtf")
full_le = pr.read_gtf("data/novel_ref_combined.last_exons.gtf")

# restrict both references to the validated bleedthrough last exons
bleed_le_ids = set(bleeds_y["le_id"])


def _is_validated_bleed(df):
    # boolean mask of rows whose le_id belongs to a validated bleedthrough
    return df["le_id"].isin(bleed_le_ids)


quant_le_bld = quant_le.subset(_is_validated_bleed)
full_le_bld = full_le.subset(_is_validated_bleed)
quant_le

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,24419640,24420128,.,+,.,ENSG00000001461.17,NIPAL3,...,last,24419290,24419640,ENST00000003912.7,488.0,internal_exon_extension,ENSG00000001461.17,NIPAL3,1.0,ENSG00000001461.17_1
1,chr1,.,exon,24421813,24422110,.,+,.,ENSG00000001461.17,NIPAL3,...,last,244196402441964024419640,244401712444017124433103,"ENST00000358028.8,ENST00000374399.9,ENST000000...","NULL,NULL,NULL",internal_exon_spliced,ENSG00000001461.17,NIPAL3,2.0,ENSG00000001461.17_2
2,chr1,.,exon,24454053,24454824,.,+,.,ENSG00000001461.17,NIPAL3,...,last,2445350424453504,2445613724456137,"ENST00000003912.7,ENST00000374399.9","NULL,NULL",internal_exon_spliced,ENSG00000001461.17,NIPAL3,3.0,ENSG00000001461.17_3
3,chr1,.,exon,24464120,24466378,.,+,.,ENSG00000001461.17,NIPAL3,...,last,24464025,24464120,ENST00000003912.7,2258.0,internal_exon_extension,ENSG00000001461.17,NIPAL3,4.0,ENSG00000001461.17_4
4,chr1,.,exon,24468985,24472976,.,+,.,ENSG00000001461.17,NIPAL3,...,last,2446412024464120,2446898524468985,"ENST00000003912.7,ENST00000374399.9","NULL,NULL",last_exon_spliced,ENSG00000001461.17,NIPAL3,5.0,ENSG00000001461.17_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129925,chrY,.,exon,21880075,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
129926,chrY,.,exon,21880075,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
129927,chrY,.,exon,21880307,21880652,.,-,.,ENSG00000244395.6,RBMY1D,...,last,218806522188065221880652,218810632188106321881063,"ENST00000382653.6,ENST00000382680.5,ENST000004...","NULL,NULL,NULL",last_exon_spliced,ENSG00000244395.6,RBMY1D,1.0,ENSG00000244395.6_1
129928,chrY,.,exon,21038288,21039044,.,-,.,ENSG00000254488.1,ENSG00000254488,...,last,21039044,21042268,ENST00000527562.1,,last_exon_spliced,ENSG00000254488.1,ENSG00000254488,1.0,ENSG00000254488.1_1


## Step 1 - identify bleedthroughs that have an overlapping/annotated CDS

In some cases, bleedthrough events are defined by the pipeline as novel extensions of annotated last exons (e.g. SIN3B). These events already have a CDS annotated for the coding region of the last exon, so we can use that CDS to define the cryptic peptide.

Strategy:
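1. Overlap 'quantification' last exons with CDSs, checking that the parent transcripts match (note: the join below only requires a same-strand overlap - it does not yet filter on matching transcript IDs)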
2. Keep these CDSs for these bleedthroughs

In [62]:
# CDS and stop codons are annotated as separate features in GTFs - pull each out on its own.
# The full coding region of a bleedthrough should include the stop codon; merging the two
# feature types into a single interval per transcript ID is currently disabled (commented out below)
ref_gtf_cds = ref_gtf.subset(lambda df: df["Feature"].eq("CDS"))
ref_gtf_sc = ref_gtf.subset(lambda df: df["Feature"].eq("stop_codon"))
# ref_gtf_cds_m = pr.concat([ref_gtf_cds, ref_gtf_sc]).merge(strand=True, by="transcript_id")
# ref_gtf_cds_m


In [63]:
# Same-strand overlap join of the bleedthrough 'quant' last exon regions with annotated CDSs.
# CDS-derived columns that clash with last exon columns receive the "_cds" suffix.
# NOTE: no filter on matching transcript IDs is applied here
quant_le_join_cols = ["ref_gene_name", "le_id", "transcript_id", "transcript_id_ref"]
cds_join_cols = ["gene_name", "gene_id", "transcript_id", "Frame"]

bld_cds_anno = quant_le_bld[quant_le_join_cols].join(
    ref_gtf_cds[cds_join_cols],
    strandedness="same",
    suffix="_cds",
)
bld_cds_anno

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name,le_id,transcript_id,transcript_id_ref,Start_cds,End_cds,Strand_cds,gene_name,gene_id,transcript_id_cds,Frame
0,chr7,44579896,44581092,-,TMED4,ENSG00000158604.15_1,ENST00000481238.1,ENST00000457408.7,44581068,44581239,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0
1,chr7,44579903,44581092,-,TMED4,ENSG00000158604.15_1,ENST00000477639.5,ENST00000457408.7,44581068,44581239,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0
2,chr13,50814965,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.ctrl_ctrl_2.18282.3,,50823316,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0
3,chr13,50815080,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.TDP43_ctrl_3.17822.2,,50823316,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0
4,chr18,12672624,12673387,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12673367,12673503,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1
5,chr18,12672624,12673387,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12673367,12673503,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1
6,chr18,12673096,12673387,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673367,12673503,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1
7,chr18,12673096,12673387,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673367,12673503,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1
8,chr19,16854261,16857420,+,SIN3B,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,,16854142,16854262,+,SIN3B,ENSG00000127511.10,ENST00000596802.5,0
9,chr19,32635458,32639355,-,ANKRD27,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,,32639304,32639488,-,ANKRD27,ENSG00000105186.16,ENST00000587352.5,1


In [64]:
# Swap the coordinates so the CDS becomes the main interval; the '*_cds' columns then hold
# the last exon coordinates, so rename them to '*_le'
le_coord_names = {"Start_cds": "Start_le", "End_cds": "End_le", "Strand_cds": "Strand_le"}

cds_bld_anno = (
    bld_cds_anno
    .new_position("swap")
    .apply(lambda df: df.rename(columns=le_coord_names))
    # CDS entries lack the stop codon, so extend the 3' end by 3nt to add it in
    .extend({"3": 3})
)
cds_bld_anno

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name,le_id,transcript_id,transcript_id_ref,Start_le,End_le,Strand_le,gene_name,gene_id,transcript_id_cds,Frame
0,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000481238.1,ENST00000457408.7,44579896,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0
1,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000477639.5,ENST00000457408.7,44579903,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0
2,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.ctrl_ctrl_2.18282.3,,50814965,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0
3,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.TDP43_ctrl_3.17822.2,,50815080,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0
4,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1
5,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1
6,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1
7,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1
8,chr19,16854142,16854265,+,SIN3B,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,,16854261,16857420,+,SIN3B,ENSG00000127511.10,ENST00000596802.5,0
9,chr19,32639301,32639488,-,ANKRD27,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,,32635458,32639355,-,ANKRD27,ENSG00000105186.16,ENST00000587352.5,1


In [65]:
# Pull the genomic sequence of each (stop-codon-extended) CDS interval from the genome FASTA
# and store it as the 'dna_seq' column
genome_fasta = "data/GRCh38.primary_assembly.genome.fa"
seqs_cds_bld_anno = pr.get_sequence(cds_bld_anno, genome_fasta)
cds_bld_anno.dna_seq = seqs_cds_bld_anno

cds_bld_anno

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name,le_id,transcript_id,transcript_id_ref,Start_le,End_le,Strand_le,gene_name,gene_id,transcript_id_cds,Frame,dna_seq
0,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000481238.1,ENST00000457408.7,44579896,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...
1,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000477639.5,ENST00000457408.7,44579903,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...
2,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.ctrl_ctrl_2.18282.3,,50814965,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...
3,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.TDP43_ctrl_3.17822.2,,50815080,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...
4,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...
5,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...
6,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...
7,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...
8,chr19,16854142,16854265,+,SIN3B,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,,16854261,16857420,+,SIN3B,ENSG00000127511.10,ENST00000596802.5,0,GTCCGCCGGGTGCTGAAGAGCCAGGAGGTGTATGAAAACTTCCTCC...
9,chr19,32639301,32639488,-,ANKRD27,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,,32635458,32639355,-,ANKRD27,ENSG00000105186.16,ENST00000587352.5,1,GATGGCAAATTTGAGTTACATCAAAAACTTCAGGTTTAGCAGCTTG...


### Translating CDSs

Minimal information needed is:
- DNA sequence
- Frame - how many bases to remove from start of sequence to get the first base of a codon

To translate all codons (or all but an incomplete first codon), we need to ensure the DNA sequence starts at the first base of a complete codon. From there on it should be a simple translation.
- Convert to biopython Seq object (stripping frame positions from start of dna_sequence)
- seq.translate()
- convert back to string

In [66]:
def _seq_from_codon_start(row):
    # skip the first 'Frame' nucleotides so the Seq starts on a complete codon
    n_skip = int(row["Frame"])
    return Seq(row["dna_seq"][n_skip:])


# add the frame-trimmed sequences as BioPython Seq objects in a 'seq_dna_seq' column
cds_bld_anno = cds_bld_anno.assign(
    "seq_dna_seq",
    lambda df: df.apply(_seq_from_codon_start, axis=1),
)

cds_bld_anno
                                                            

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name,le_id,transcript_id,transcript_id_ref,Start_le,End_le,Strand_le,gene_name,gene_id,transcript_id_cds,Frame,dna_seq,seq_dna_seq
0,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000481238.1,ENST00000457408.7,44579896,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...,"(C, G, G, G, T, G, C, A, T, C, T, C, G, A, C, ..."
1,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000477639.5,ENST00000457408.7,44579903,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...,"(C, G, G, G, T, G, C, A, T, C, T, C, G, A, C, ..."
2,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.ctrl_ctrl_2.18282.3,,50814965,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...,"(G, A, T, A, G, T, G, T, T, G, A, G, T, T, T, ..."
3,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.TDP43_ctrl_3.17822.2,,50815080,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...,"(G, A, T, A, G, T, G, T, T, G, A, G, T, T, T, ..."
4,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ..."
5,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ..."
6,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ..."
7,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ..."
8,chr19,16854142,16854265,+,SIN3B,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,,16854261,16857420,+,SIN3B,ENSG00000127511.10,ENST00000596802.5,0,GTCCGCCGGGTGCTGAAGAGCCAGGAGGTGTATGAAAACTTCCTCC...,"(G, T, C, C, G, C, C, G, G, G, T, G, C, T, G, ..."
9,chr19,32639301,32639488,-,ANKRD27,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,,32635458,32639355,-,ANKRD27,ENSG00000105186.16,ENST00000587352.5,1,GATGGCAAATTTGAGTTACATCAAAAACTTCAGGTTTAGCAGCTTG...,"(A, T, G, G, C, A, A, A, T, T, T, G, A, G, T, ..."


In [67]:
def _translate_to_str(seq):
    # translate the whole sequence; a stop codon shows up as "*" in the peptide string
    return str(seq.translate())


# store the translated peptides as the 'peptide_seq' column
cds_bld_anno.peptide_seq = cds_bld_anno.seq_dna_seq.apply(_translate_to_str)
cds_bld_anno

Unnamed: 0,Chromosome,Start,End,Strand,ref_gene_name,le_id,transcript_id,transcript_id_ref,Start_le,End_le,Strand_le,gene_name,gene_id,transcript_id_cds,Frame,dna_seq,seq_dna_seq,peptide_seq
0,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000481238.1,ENST00000457408.7,44579896,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...,"(C, G, G, G, T, G, C, A, T, C, T, C, G, A, C, ...",RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...
1,chr7,44581065,44581239,-,TMED4,ENSG00000158604.15_1,ENST00000477639.5,ENST00000457408.7,44579903,44581092,-,TMED4,ENSG00000158604.15,ENST00000481238.1,0,CGGGTGCATCTCGACATCCAGGTTGGGGAGCATGCCAACAACTACC...,"(C, G, G, G, T, G, C, A, T, C, T, C, G, A, C, ...",RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQD...
2,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.ctrl_ctrl_2.18282.3,,50814965,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...,"(G, A, T, A, G, T, G, T, T, G, A, G, T, T, T, ...",DSVEFRNICSHLALQIEGQQFDRDLNAAHQCLKTIVKKLIQSLANF...
3,chr13,50823313,50823520,-,DLEU7,ENSG00000186047.11_2,PAPA.TDP43_ctrl_3.17822.2,,50815080,50823520,-,DLEU7,ENSG00000186047.11,ENST00000504404.2,0,GATAGTGTTGAGTTTAGAAACATCTGCAGTCATTTGGCTCTACAGA...,"(G, A, T, A, G, T, G, T, T, G, A, G, T, T, T, ...",DSVEFRNICSHLALQIEGQQFDRDLNAAHQCLKTIVKKLIQSLANF...
4,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ...",SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
5,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000262127.7,ENST00000590143.5,12672624,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ...",SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
6,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000262127.7,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ...",SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
7,chr18,12673364,12673503,-,CEP76,ENSG00000101624.11_3,ENST00000423709.6,ENST00000590143.5,12673096,12673387,-,CEP76,ENSG00000101624.11,ENST00000423709.6,1,ATCTCCTTTCTGTGAAGAAATAATCTGTTGCCGTGGAGACCAAGTG...,"(T, C, T, C, C, T, T, T, C, T, G, T, G, A, A, ...",SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
8,chr19,16854142,16854265,+,SIN3B,ENSG00000127511.10_2,PAPA.ctrl_ctrl_2.24194.4,,16854261,16857420,+,SIN3B,ENSG00000127511.10,ENST00000596802.5,0,GTCCGCCGGGTGCTGAAGAGCCAGGAGGTGTATGAAAACTTCCTCC...,"(G, T, C, C, G, C, C, G, G, G, T, G, C, T, G, ...",VRRVLKSQEVYENFLRCIALFNQELVSGSELLQLVSPFLG*
9,chr19,32639301,32639488,-,ANKRD27,ENSG00000105186.16_3,PAPA.TDP43_19065411_S54.22523.3,,32635458,32639355,-,ANKRD27,ENSG00000105186.16,ENST00000587352.5,1,GATGGCAAATTTGAGTTACATCAAAAACTTCAGGTTTAGCAGCTTG...,"(A, T, G, G, C, A, A, A, T, T, T, G, A, G, T, ...",MANLSYIKNFRFSSLAKDELGYCLTSFEAAIEYIRQGSLSAKPPVR...


In [68]:
# print one tab-separated "gene name<TAB>peptide" line per row
anno_peptides = cds_bld_anno.as_df()
for gene, peptide in zip(anno_peptides["gene_name"], anno_peptides["peptide_seq"]):
    print("\t".join((gene, peptide)))

TMED4	RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQDYQRASAYLLVI*
TMED4	RVHLDIQVGEHANNYPEIAAKDKLTELQLRARQLLDQVEQIQKEQDYQRASAYLLVI*
DLEU7	DSVEFRNICSHLALQIEGQQFDRDLNAAHQCLKTIVKKLIQSLANFPSDAHMVACASLRQILQNLPDI*
DLEU7	DSVEFRNICSHLALQIEGQQFDRDLNAAHQCLKTIVKKLIQSLANFPSDAHMVACASLRQILQNLPDI*
CEP76	SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
CEP76	SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
CEP76	SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
CEP76	SPFCEEIICCRGDQVRLAVRVRVFTYPESACAVWIMFACKYRSVL*
SIN3B	VRRVLKSQEVYENFLRCIALFNQELVSGSELLQLVSPFLG*
ANKRD27	MANLSYIKNFRFSSLAKDELGYCLTSFEAAIEYIRQGSLSAKPPVRSHPCPGLPLWASWFP*




I manually checked the above and they are correct - now to define full cryptic peptides for those with no annotated CDS overlapping the unique region. To do this:
1. set of le_ids with overlapping/already defined peptides
2. For the remaining last exons, take the full last exon sequences and find overlapping annotated CDSs. Make sure they exactly match at the 5' ends of the exon
    - This gives us the frame/phase for the exon 
3. Extract sequence and translate as before

## Translating last exons without an annotated CDS extending into unique region

In [82]:
# get full coordinates of bleedthrough last exons without an annotated peptide,
# dropping intervals with duplicated positions (strand-aware)
annotated_le_ids = set(cds_bld_anno.le_id)

full_le_bld_nol = (
    full_le_bld
    .subset(lambda df: ~df["le_id"].isin(annotated_le_ids))
    .drop_duplicate_positions(strand=True)
)
full_le_bld_nol

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,21453372,21457150,.,+,.,PAPA.TDP-1.345,,...,,,,,,last_exon_extension,ENSG00000142794.19,NBPF3,3.0,ENSG00000142794.19_3
1,chr1,.,exon,84561927,84563418,.,-,.,PAPA.ctrl_ctrl_1.1091,,...,,,,,,internal_exon_extension,ENSG00000117151.13,CTBS,1.0,ENSG00000117151.13_1
2,chr2,.,exon,225651431,225658565,.,+,.,PAPA.TDP43_ctrl_3.4451,,...,,,,,,internal_exon_extension,ENSG00000144460.13,NYAP2,2.0,ENSG00000144460.13_2
3,chr2,.,exon,197630416,197631220,.,-,.,PAPA.TDP43_ctrl_4.4119,,...,,,,,,internal_exon_extension,ENSG00000162944.11,RFTN2,1.0,ENSG00000162944.11_1
4,chr2,.,exon,197630635,197631220,.,-,.,PAPA.TDP-4.4038,,...,,,,,,internal_exon_extension,ENSG00000162944.11,RFTN2,1.0,ENSG00000162944.11_1
5,chr5,.,exon,148377588,148378845,.,-,.,PAPA.TDP43_19065413_S19.7930,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
6,chr5,.,exon,148377695,148378845,.,-,.,PAPA.Cont-D_S4.6266,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
7,chr5,.,exon,148378366,148378845,.,-,.,PAPA.TDP43-E_S5.6788,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
8,chr6,.,exon,42935573,42936378,.,+,.,PAPA.TDP-4.8982,,...,,,,,,internal_exon_extension,ENSG00000137161.18,CNPY3,1.0,ENSG00000137161.18_1
9,chr7,.,exon,43444214,43449982,.,+,.,PAPA.TDP43-G_S7.7990,,...,,,,,,internal_exon_spliced,ENSG00000002746.15,"HECW1,HECW1",3.0,ENSG00000002746.15_3


In [83]:
def _cds_matches_exon_5p_end(df):
    # exact 5' match: same Start on the plus strand, same End on the minus strand
    plus_match = (df.Strand == "+") & (df.Start == df.Start_cds)
    minus_match = (df.Strand == "-") & (df.End == df.End_cds)
    return plus_match | minus_match


# same-strand overlap join of last exons with CDSs, keeping only exact 5' matches
full_le_join_cols = ["gene_name", "le_id", "transcript_id", "transcript_id_ref", "ref_gene_name"]
ref_cds_join_cols = ["gene_name", "gene_id", "transcript_id", "Frame"]

cds_bld_anno_nol = (
    full_le_bld_nol[full_le_join_cols]
    .join(ref_gtf_cds[ref_cds_join_cols], strandedness="same", suffix="_cds")
    .subset(_cds_matches_exon_5p_end)
)

cds_bld_anno_nol

Unnamed: 0,Chromosome,Start,End,Strand,gene_name,le_id,transcript_id,transcript_id_ref,ref_gene_name,Start_cds,End_cds,Strand_cds,gene_name_cds,gene_id,transcript_id_cds,Frame
0,chr1,84561927,84563418,-,,ENSG00000117151.13_1,PAPA.ctrl_ctrl_1.1091.9,,CTBS,84563256,84563418,-,CTBS,ENSG00000117151.13,ENST00000370630.6,0
1,chr2,225651431,225658565,+,,ENSG00000144460.13_2,PAPA.TDP43_ctrl_3.4451.2,,NYAP2,225651431,225651535,+,NYAP2,ENSG00000144460.13,ENST00000272907.8,2
2,chr2,197630416,197631220,-,,ENSG00000162944.11_1,PAPA.TDP43_ctrl_4.4119.3,,RFTN2,197631010,197631220,-,RFTN2,ENSG00000162944.11,ENST00000295049.9,2
3,chr2,197630635,197631220,-,,ENSG00000162944.11_1,PAPA.TDP-4.4038.5,,RFTN2,197631010,197631220,-,RFTN2,ENSG00000162944.11,ENST00000295049.9,2
4,chr6,42935573,42936378,+,,ENSG00000137161.18_1,PAPA.TDP-4.8982.1,,CNPY3,42935573,42935670,+,CNPY3,ENSG00000137161.18,ENST00000372836.5,1
5,chr7,43444217,43448981,+,,ENSG00000002746.15_3,PAPA.TDP43_ctrl_2.10456.1,,HECW1,43444217,43445570,+,HECW1,ENSG00000002746.15,ENST00000453890.5,2
6,chr7,43444217,43448981,+,,ENSG00000002746.15_3,PAPA.TDP43_ctrl_2.10456.1,,HECW1,43444217,43445570,+,HECW1,ENSG00000002746.15,ENST00000395891.7,2
7,chr8,37766279,37766741,+,PLPBP,ENSG00000147471.12_1,ENST00000520073.5,ENST00000328195.8,PLPBP,37766279,37766355,+,PLPBP,ENSG00000147471.12,ENST00000328195.8,0
8,chr8,130899857,130904042,-,,ENSG00000155897.10_1,PAPA.TDP43_ctrl_3.12351.12,,ADCY8,130903771,130904042,-,ADCY8,ENSG00000155897.10,ENST00000286355.10,1
9,chr8,130899857,130904042,-,,ENSG00000155897.10_1,PAPA.TDP43_ctrl_3.12351.12,,ADCY8,130903771,130904042,-,ADCY8,ENSG00000155897.10,ENST00000377928.7,1


In [84]:
# `cds_bld_anno_nol` (last exons overlap-joined with CDSs and filtered for exact 5' matches)
# is already built by the identical code in the previous cell; recomputing it here was
# redundant, so this cell only displays the existing result
cds_bld_anno_nol

Unnamed: 0,Chromosome,Start,End,Strand,gene_name,le_id,transcript_id,transcript_id_ref,ref_gene_name,Start_cds,End_cds,Strand_cds,gene_name_cds,gene_id,transcript_id_cds,Frame
0,chr1,84561927,84563418,-,,ENSG00000117151.13_1,PAPA.ctrl_ctrl_1.1091.9,,CTBS,84563256,84563418,-,CTBS,ENSG00000117151.13,ENST00000370630.6,0
1,chr2,225651431,225658565,+,,ENSG00000144460.13_2,PAPA.TDP43_ctrl_3.4451.2,,NYAP2,225651431,225651535,+,NYAP2,ENSG00000144460.13,ENST00000272907.8,2
2,chr2,197630416,197631220,-,,ENSG00000162944.11_1,PAPA.TDP43_ctrl_4.4119.3,,RFTN2,197631010,197631220,-,RFTN2,ENSG00000162944.11,ENST00000295049.9,2
3,chr2,197630635,197631220,-,,ENSG00000162944.11_1,PAPA.TDP-4.4038.5,,RFTN2,197631010,197631220,-,RFTN2,ENSG00000162944.11,ENST00000295049.9,2
4,chr6,42935573,42936378,+,,ENSG00000137161.18_1,PAPA.TDP-4.8982.1,,CNPY3,42935573,42935670,+,CNPY3,ENSG00000137161.18,ENST00000372836.5,1
5,chr7,43444217,43448981,+,,ENSG00000002746.15_3,PAPA.TDP43_ctrl_2.10456.1,,HECW1,43444217,43445570,+,HECW1,ENSG00000002746.15,ENST00000453890.5,2
6,chr7,43444217,43448981,+,,ENSG00000002746.15_3,PAPA.TDP43_ctrl_2.10456.1,,HECW1,43444217,43445570,+,HECW1,ENSG00000002746.15,ENST00000395891.7,2
7,chr8,37766279,37766741,+,PLPBP,ENSG00000147471.12_1,ENST00000520073.5,ENST00000328195.8,PLPBP,37766279,37766355,+,PLPBP,ENSG00000147471.12,ENST00000328195.8,0
8,chr8,130899857,130904042,-,,ENSG00000155897.10_1,PAPA.TDP43_ctrl_3.12351.12,,ADCY8,130903771,130904042,-,ADCY8,ENSG00000155897.10,ENST00000286355.10,1
9,chr8,130899857,130904042,-,,ENSG00000155897.10_1,PAPA.TDP43_ctrl_3.12351.12,,ADCY8,130903771,130904042,-,ADCY8,ENSG00000155897.10,ENST00000377928.7,1


In [90]:
# Full last exons now carry an exactly matched annotated CDS (and therefore the frame
# needed to start on a complete codon), so they can be translated
cds_bld_anno_nol = translate_exon(cds_bld_anno_nol)

# print each distinct gene / peptide pair once as "gene name<TAB>peptide"
unique_peptides = cds_bld_anno_nol.as_df().drop_duplicates(subset=["ref_gene_name", "peptide_seq"])
for gene, peptide in zip(unique_peptides["ref_gene_name"], unique_peptides["peptide_seq"]):
    print("\t".join((gene, peptide)))


CTBS	DHVCTIAKVPFRGAPCSDAAGRQVPYKTIMKQINSSISGNLWDKDQRAPYYNYKVRLFVSYEHLFY
NYAP2	PKVSCKLGRSASTSGVPPPSVTPLRQSSDLQQSQVPSSLANRD
RFTN2	SDNKLYTVFNAFDDDSTSWAYQEGILSMKVTRKGSVISTLDADWLELTTFYYKQGLSLIDSFVFWETSKGKFYVNDILYLRNLNLYQNQ
CNPY3	DLRLIEVTETICKRLLDYSLHKERTGSNRFAKVGFGIVLHPLWGQACMYLSVSAGVSVI
HECW1	DEEISLSTEPESAQIQDSPMNNLMESGSGEPRSEAPESSESWKPEQLGEGSVPDGPGNQSIELSRPAEEAAVITEAGDQGMVSVGPEGAGELLAQVQKDIQPAPSAEELAEQLDLGEEASALLLEDGEAPASTKEEPLEEEATTQSRAGREEEEKEQEEEGDVSTLEQGEGRLQLRASVKRKSRPCSLPVSELETVIASACGDPETPRTHYIRIHTLLHSMPSAQGGSAAEEEDGAEEESTLKDSSEKDGLSEVDTVAADPSALEEDREEPEGATPGTAHPGHSGGHFPSLANGAAQDGDTHPSTGSESDSSPRQGGDHSCEGCDASCCSPSCYSSSCYSTSCYSSSCYSASCYSPSCYNGNRFASHTRFSSVDSAKISESTVFSSQDDEEEENSAFESVPDSMQSPELDPESTNGAGPWQDELAAPSGHVERSPEGLESPVAGPSNRREG
PLPBP	ILSLCPEIKWHFIGHLQKQNVNKLMGKIKLNMKTKLFCHCIASTTLCGRESNF
ADCY8	RIHISKATLDCLNGDYNVEEGHGKERNEFLRKHNIETYLIKQPEDSLLSLPEDIVKESVSSSDRRNSGATFTEGSWSPELPFDNIVGKQNVSPLFLLLLGHECMLVCISNPDTSEKTDSSLTFSPMNTIIANDQTISLSPQDRINLVCNFRKCQKCQRVPNTLSLSLFFFL
ACER3	SFLPSSLKSNLKLV



Manually checking a few:
- USP31 - There is a TSL5 transcript so can cross-check, is spot on! There is possibly a shortened one without a stop codon :O
- CNPY3 - also correct (refseq has extension partially annotated)
- ACER3 - also correct
- ADCY8 - also correct



In [94]:
# which last exons did not get a translated peptide (no CDS with an exact 5' match)?
translated_le_ids = set(cds_bld_anno_nol.le_id)
full_le_bld_nol.subset(lambda df: ~df["le_id"].isin(translated_le_ids))

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,region_rank,Start_ref,End_ref,transcript_id_ref,3p_extension_length,event_type,ref_gene_id,ref_gene_name,le_number,le_id
0,chr1,.,exon,21453372,21457150,.,+,.,PAPA.TDP-1.345,,...,,,,,,last_exon_extension,ENSG00000142794.19,NBPF3,3.0,ENSG00000142794.19_3
1,chr5,.,exon,148377588,148378845,.,-,.,PAPA.TDP43_19065413_S19.7930,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
2,chr5,.,exon,148377695,148378845,.,-,.,PAPA.Cont-D_S4.6266,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
3,chr5,.,exon,148378366,148378845,.,-,.,PAPA.TDP43-E_S5.6788,,...,,,,,,internal_exon_extension,ENSG00000247199.6,FBXO38-DT,1.0,ENSG00000247199.6_1
4,chr10,.,exon,89330996,89335534,.,-,.,PAPA.ctrl_ctrl_4.14569,,...,,,,,,last_exon_extension,ENSG00000107798.18,LIPA,2.0,ENSG00000107798.18_2
5,chr10,.,exon,89331008,89335534,.,-,.,PAPA.TDP43_19065403_S23.13899,,...,,,,,,last_exon_extension,ENSG00000107798.18,LIPA,2.0,ENSG00000107798.18_2
6,chr10,.,exon,89332056,89335534,.,-,.,PAPA.TDP-6.13417,,...,,,,,,last_exon_extension,ENSG00000107798.18,LIPA,2.0,ENSG00000107798.18_2
7,chr10,.,exon,89334602,89335534,.,-,.,PAPA.TDP43-G_S7.10969,,...,,,,,,last_exon_extension,ENSG00000107798.18,LIPA,2.0,ENSG00000107798.18_2
8,chrX,.,exon,91882906,91891321,.,+,.,PAPA.TDP43_ctrl_4.26532,,...,,,,,,last_exon_extension,ENSG00000102290.23,PCDH11X,3.0,ENSG00000102290.23_3
9,chrX,.,exon,131823775,131825365,.,-,.,PAPA.TDP43-F_S6.20505,,...,,,,,,internal_exon_extension,ENSG00000213468.7,FIRRE,1.0,ENSG00000213468.7_1


FIRRE is definitely a lncRNA; the others I am less sure about (**TODO**: check the reference GTF) - if they are protein-coding, maybe they simply have no annotated CDS?

### Extracting unique regions of peptides

In [50]:
# function to match two peptide sequences & find position where they stop matching (assuming they start at the same position)
def longest_matching_substring(str1: str, str2: str) -> int:
    '''Return the index of the final position in longest exactly matched substring from the beginning of two strings

    this is intended for strings that begin with identical values. it only checks the longest matching substring from the first position in each string. if there is one mismatch, the function terminates

    If the strings share no matching prefix at all (the first characters differ, or either string is empty), -1 is returned. This keeps the 'add one to slice' convention valid: `s[:result + 1]` is then the empty string and `s[result + 1:]` is the whole string.

    Parameters
    ----------
    str1 : str
        first string of pair wish to find the longest matching substring from the beginning of the string
    str2 : str
        second string of pair wish to find the longest matching substring from the beginning of the string

    Returns
    -------
    int
        index of the final position (in either string) of the longest matched substring between the two, or -1 if not even the first position matches. If you wish to slice a string to retain/exclude the longest match, remember to add one to returned value
    '''
    # -1 = "nothing matched"; starting at 0 would be indistinguishable from a match of the first character only
    last_matched_idx = -1
    for i, (char1, char2) in enumerate(zip(str1, str2)):
        if char1 != char2:
            break
        last_matched_idx = i

    return last_matched_idx

# quick sanity check on a pair of strings sharing the prefix "abc"
string1 = "abcdefg"
string2 = "abcxyz"

# evaluate the match once and reuse it for every report line
match_end = longest_matching_substring(string1, string2)

print("Testing with strings:", string1, string2)
print("Original function result:", match_end)
print("Original function result extracting matched string1:", string1[:match_end + 1])
print("Original function result extracting unique part of string1:", string1[match_end + 1:])


Testing with strings: abcdefg abcxyz
Original function result: 2
Original function result extracting matched string1: abc
Original function result extracting unique part of string1: defg
